diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h
index 6eb169d8fff0501d2d31e60030128e2527bb8c71..89ce00fe87428ac6e449cab85e6919174dcbbed5 100644
--- a/paddle/fluid/distributed/collective/HCCLTools.h
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
@@ -94,7 +94,7 @@ class NPUEventManager {
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h
index 197761dc3c3234de53ed902676f95a1ed00f0238..c00b081438c51feacedcf3eb9dfbef503dc8a0c8 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.h
+++ b/paddle/fluid/distributed/collective/NCCLTools.h
@@ -104,7 +104,7 @@ class EventManager {
   bool DeviceId() const { return device_index_; }
   gpuEvent_t GetRawCudaEvent() const { return event_; }

-  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+  void Record(const phi::GPUContext& ctx) {
     auto device_index = ctx.GetPlace().device;
     if (!is_created_) {
       CreateEvent(device_index);
@@ -112,7 +112,7 @@ class EventManager {
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
@@ -157,13 +157,13 @@ class EventManager {
     }
   }

-  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+  void Block(const phi::GPUContext& ctx) const {
     if (is_created_) {
       auto device_index = ctx.GetPlace().device;
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 81db9b94da93e8607045e3916d0fff14ce8ca5d6..d776f62373e43bc672ba85fdb316a0dc28a43f88 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -31,10 +31,10 @@ namespace distributed {

 void SyncDefaultStream(
     const std::vector<Place>& places,
-    std::vector<EventManager>& ncclEvents,                       // NOLINT
-    std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) {  // NOLINT
+    std::vector<EventManager>& ncclEvents,                    // NOLINT
+    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {  // NOLINT
   for (size_t i = 0; i < places.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
     ncclEvents[i].Record(*default_ctx);
     ncclEvents[i].Block(*dev_ctx[i]);
@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(
 void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places_[i]));
     default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
   }
 }
@@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
           << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);

-  std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
+  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
   dev_ctx.resize(places.size());

   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
@@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   for (size_t i = 0; i
< places.size(); ++i) { platform::CUDADeviceGuard guard(places[i]); nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); - dev_ctx[i].reset(new CUDADeviceContext(places[i])); + dev_ctx[i].reset(new phi::GPUContext(places[i])); } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4dd44771d15d584cc148161325291000bae4800c..5adb6867eb8ef86fd78b474adc6591e5bd839168 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -45,7 +45,6 @@ namespace paddle { namespace distributed { using Place = paddle::platform::Place; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; class ProcessGroupNCCL : public ProcessGroup { public: @@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup { std::unordered_map> places_to_events_; - std::unordered_map>> + std::unordered_map>> places_to_ctx_; std::set used_place_ids_; diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 5f137c4d0af05379dd4f9d8f3e087cf70fa26ae5..8f4466f7baa738870439355395838781fc1413c7 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context, void EagerGroup::ConcatTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( *default_ctx, dense_tensors_, &dense_contents_, dtype_); @@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( *default_ctx, &dense_contents_, &dense_tensors_, dtype_); @@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); if (platform::is_gpu_place(inner_place_)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - dev_ctx = static_cast( + dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e641d6311c6ce05d02929373513fcbfb4580e254..0b46369b970ab1c3bbcff46be93b119ed4e75ae1 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, VLOG(3) << "Loading data for GPU."; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - dynamic_cast(pool.Get(place)); + auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; memory::Copy(gpu_place, static_cast(input_tensor_ptr), diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc 
b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 3ed4277c61edd41b745e5e6273985c24561f5d34..b98e85f9c23e5338a96eb5cca6a1c33484e26ed0 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var, framework::DataTypeSize(tensor->dtype())]; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), @@ -311,8 +308,7 @@ void DeserializeSelectedRows( unsigned long data_len; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 91a20a432a3f4f57eeb562ad628e26647bbfdc61..84ef0b02bedcde4f8e6b66bf9a419d96dd9aefad 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx, std::vector temp; temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype())); char* temp_ptr = temp.data(); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(platform::CPUPlace(), temp_ptr, tensor->place(), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 1409119daf1d39d104e09ae6936d236d16b08947..002b83307638b7ea6333d0157bd835b983896f43 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_place)); + auto* dev_ctx = + dynamic_cast(pool.Get(expected_kernel_place)); if (!dev_ctx) { PADDLE_THROW(paddle::platform::errors::Fatal( "Cannot convert device_context to CUDADeviceContext." "This indicates backend mismatch." 
"Pleas double check your expected place")); } - ScaleDeviceDispatch( - *dense_tensor.get(), - *dev_ctx, - scale, - bias, - bias_after_scale, - dense_out.get()); + ScaleDeviceDispatch(*dense_tensor.get(), + *dev_ctx, + scale, + bias, + bias_after_scale, + dense_out.get()); #endif } else { PADDLE_THROW(paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 6b2b9c9f34a6d00b1815886c2a5c66ed98747a45..f8c06a5afff121bc7e770a94b1d157ae99682274 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::framework::details::tensor_check< - paddle::platform::CUDADeviceContext>( + paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index f275e3f0bf1ad77663642f6c5b5cc19ac9acf630..6441ce1e788435b0d657702b74b69456fd639727 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(place, mutable_x, @@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); auto* x_tensor = X->MutableVar()->GetMutable(); @@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 05ab86028dae4b8d4a261cf606e5440dac88f76b..b41938d4856ddf678d13a6d5f2cef5672bd809b4 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue( if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* 
dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h index 8540fc7e10de106aaa3f6010b315820ceb057831..0e62e5c2da693b42ae3ba9493b1d3ad6c8375b02 100644 --- a/paddle/fluid/eager/tests/test_utils.h +++ b/paddle/fluid/eager/tests/test_utils.h @@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 94e7918e800ef0d7bc8948e3ece381d90c64c249..cd76747c03599d134211243eacf86aef6565813f 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT( paddle::framework::OpKernelTestProtoAndCheckerMaker); REGISTER_OP_CPU_KERNEL(test_op, paddle::framework::TestKernel); -REGISTER_OP_CUDA_KERNEL( - test_op, - paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL(test_op, + paddle::framework::TestKernel); static void BuildVar(const std::string& param_name, std::initializer_list arguments, diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 8ffb58f945156463f614c840dbc9d90acb7e9cc9..4b5177aaa45c80e69ab9b4a805319fc6358b5967 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place, const std::vector& infos) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {} void MiniBatchGpuPack::reset(const paddle::platform::Place& place) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); ins_num_ = 0; diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index d144673d62d6bfdcdc7a3297977c381a73b5efaa..681fb1fdb295ce01117fe9d46083f9282ab091dd 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset( const int float_slot_size, const UsedSlotGpuType *used_slots) { auto stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); FillSlotValueOffsetKernel<<( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); diff --git 
a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9333e246c68bc35fb7e1e5ebed5870f4069156ad..59d20306c665a52ca98151744f8010eb007194bc 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -103,8 +103,8 @@ struct CastDataType { CastDataTypeFunctor()); #if defined(__NVCC__) || defined(__HIPCC__) } else if (platform::is_gpu_place(in_.place())) { - platform::Transform trans; - auto* context = static_cast(ctx_); + platform::Transform trans; + auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index ed5b7fc692b7b1536a26a6e5ab245d9bedc58df0..8490afd69d9ea73127a55e3b413e7b589496e365 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -19,7 +19,7 @@ limitations under the License. */ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 26ad71bafe6ff871cce937dc1fd40d11008a3aec..154bf2b354e1a4d4a0650b985453dcd9c9863a98 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -105,7 +105,7 @@ struct TestBroadcastOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); place_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); #else diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index dafeb5cdb263a82c77bd730abc43d7a76aef08f0..1e384143a3c079b706513054bd16bd17f6183a56 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( gc_(gc) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - dev_ctx_ = reinterpret_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard(place.device); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index a30e80b204d9b6ba9f9666102b58d58a1cda6f5f..0a92269c50ad2d645e47d4f24c9fac7e1822251c 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase { GarbageCollector *gc_; // not own std::vector vars_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDADeviceContext *dev_ctx_{nullptr}; + phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif }; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 
9cc1929e19ae8bf39689317a186db7b52a8cb673..45d8939f788a0830efbf39542a4dff10ad27cc1d 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -58,7 +58,7 @@ struct TestGatherOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } #else PADDLE_THROW( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 767f7b1e48b438cb52689c04b1428d4ec3b8ec5e..ea29271261073944454c9ac9122a79361ddad932 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor_check( - op_type, var_name, *tensor, place); + tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 59bbef3a095e2578828fc12c51cce78d0731e683..d91225a81416195fc493a8a72a36b32e9eda713c 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value, template <> template -void TensorCheckerVisitor::apply( +void TensorCheckerVisitor::apply( typename std::enable_if< std::is_floating_point::value || std::is_same>::value || @@ -143,7 +143,7 @@ void TensorCheckerVisitor::apply( const { int print_num = 3; - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); int dev_id = tensor_.place().device; PADDLE_ENFORCE_EQ( @@ -226,13 +226,13 @@ void TensorCheckerVisitor::apply( } template <> -void tensor_check(const std::string& op_type, - const std::string& var_name, - const framework::Tensor& tensor, - const platform::Place& place) { +void tensor_check(const std::string& op_type, + const std::string& var_name, + const framework::Tensor& tensor, + const platform::Place& place) { std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap); - TensorCheckerVisitor vistor( + TensorCheckerVisitor vistor( op_type, var_name, tensor, place); VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3d8cb208017d23074609fede9924cd00b846f09e..82f09f51c23e1b9cce5e498ac58edb00c97da882 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { dev_ctx.second->Wait(); } } else { - auto stream = - static_cast(waited_ctx)->stream(); + auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); @@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = - 
static_cast(dev_ctxes_.at(place)) - ->stream(); + static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = - static_cast(pool.Get(place)) - ->stream(); + static_cast(pool.Get(place))->stream(); platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto stream = static_cast( + auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP @@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; - auto *cuda_dev_ctx = static_cast(p.second); - VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; + auto *cuda_dev_ctx = static_cast(p.second); + VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, callback(); } else { auto *ctx = dev_ctxes_.at(p); - auto *cuda_ctx = static_cast(ctx); + auto *cuda_ctx = static_cast(ctx); cuda_ctx->RecordEvent(events_.at(p.device), callback); } #else diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 0d957bf81306f199a8eddde35e2e7821b9690765..ad7888c065402ccc1d6dd03b53c34318f77bc03d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -69,7 +69,7 @@ struct TestReduceOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new p::phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index f0c152c34d388c552f1d51f04044aaa1e4ff9328..b453e7c4a813e72d4eea832505e1bd98e5edf4f0 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -77,7 +77,7 @@ struct ScaleLossGradFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); - auto stream = static_cast(ctx_)->stream(); + auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, out_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c4bc5905aca5b5c6c5c82ee0cc172f85fc5bb826..5f46906cf8e82376dd16061d32f3cb207ec6cc20 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, const int hidden_size, const int expand_embed_dim, const int64_t total_length) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( 
platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); #ifdef PADDLE_WITH_HIP @@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const int expand_embed_dim, const int64_t total_length, const int batch_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 297f4cb4796a7b1fd5a1690e7bfea7a2931e1b5a..c4cec547bd8d14b1b331136e574112ba51608488 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -223,10 +223,10 @@ class AfsManager { delete read_stream; } int PopenBidirectionalInternal(const char* command, - FILE*& fp_read, // NOLINT - FILE*& fp_write, - pid_t& pid, // NOLINT - bool read, // NOLINT + FILE*& fp_read, // NOLINT + FILE*& fp_write, // NOLINT + pid_t& pid, // NOLINT + bool read, // NOLINT bool write) { std::lock_guard g(g_flock); int fd_read[2]; @@ -440,10 +440,9 @@ class BoxWrapper { std::vector stream_list; for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; - platform::CUDADeviceContext* context = - dynamic_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(i))); + phi::GPUContext* context = dynamic_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(i))); stream_list_[i] = context->stream(); stream_list.push_back(&stream_list_[i]); } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index ccc3575c42a1f55176ac192b176dea407a95c5fc..e57a02d72999c153b5a68279c5fe553593d9bd2f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -300,7 +300,7 @@ void AccessorWrapper::CopyForPullImpl( const int64_t total_length, int* gpu_dim, int feature_value_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -333,7 +333,7 @@ void AccessorWrapper::CopyForPushImpl( size_t grad_value_size, std::vector& slot_vector, std::vector& slot_mf_dim_vector) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 0e806fdb5f50960c3ec8e562062617643bd72e0d..36b789bdd11084a75e9b68779b9fa69508146e26 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( diff 
--git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c1f8041cc1eca34b858608ffb77598ce095d0b4f..77a666a24d9eab27e5bf5d21f175c70f6af1855c 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( : GarbageCollector(place, max_memory_size) {} void DefaultStreamGarbageCollector::Wait() const { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); + static_cast(this->dev_ctx_)->WaitStreamCallback(); } void DefaultStreamGarbageCollector::ClearCallback( const std::function &callback) { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); + static_cast(this->dev_ctx_)->AddStreamCallback(callback); } StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 73139dee6e0f943ec07bc04fcdd5aa09e5839fee..f5c226631e0d656303738d8006edc0d5cd0f409b 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope, char* temp_ptr = temp.data(); float* temp_ptr_float = reinterpret_cast(temp_ptr); temp_ptr_float[0] = micro_id; - auto stream = - reinterpret_cast(*dev_ctx).stream(); + auto stream = reinterpret_cast(*dev_ctx).stream(); memory::Copy( place, tensor_data, diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index da52af0faf39f36c3455c16264b1742fa44be347..0afeecd06b0d2a2f39f6a344a9819f6fb8b79744 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, } } #ifdef PADDLE_WITH_CUDA - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index ce5f4d743c69637771e850a1993a94d864722c2b..690dea51632ea32763905ee26f16ca4f4873a596 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name, device_code.SetWorkloadPerThread(1); device_code.Launch(n, &args); - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index cb77542e262589390f93ffb649878b1b554c8c6d..c3c3581a6a785b248ec393d6d9a1c380bea78745 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector *cpu_, size_t *gpu_memory_size_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // COPY GPU Data To CPU - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get((*gpu_)->place())); auto stream = dev_ctx->stream(); void *src = (*gpu_)->ptr(); @@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory::Alloc(place, 
*gpu_memory_size_); void *dst = (*gpu_)->ptr(); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 0eaf5dd69a5ee6da7f47455c7fdc863459113490..61d256ffb2283bf46d7c2a6487229623e95bd54f 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) { } gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { - return reinterpret_cast( + return reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3680f0aa900c6905075f918b622ee6f82c96a73e..4b72d6bea34d8496a2cc8e2ccb099d130679d051 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { platform::RecordEvent record( "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); - gpuStream_t stream = reinterpret_cast( - instr.DeviceContext()) - .stream(); + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()).stream(); auto TensorRecordStream = [&stream](Tensor& tensor) { auto allocation = tensor.Holder(); if (allocation == nullptr) { diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index fa0528d488297b3d868bd7658f954204ca47a446..9ef577f62855f7f648295d0e8c1657c3c151dfad 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::OpKernelTest); REGISTER_OP_CUDA_KERNEL( - op_with_kernel, - paddle::framework::OpKernelTest); + op_with_kernel, paddle::framework::OpKernelTest); TEST(OperatorRegistrar, CPU) { paddle::framework::proto::OpDesc op_desc; @@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) { } static int op_test_value = 0; -using paddle::platform::CUDADeviceContext; using paddle::platform::DeviceContext; using phi::CPUContext; +using phi::GPUContext; namespace paddle { namespace framework { @@ -301,7 +299,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest +class OpMultiKernelTest : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -325,7 +323,7 @@ class OpMultiKernelTest2 }; template -class OpMultiKernelTest2 +class OpMultiKernelTest2 : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -351,12 +349,12 @@ REGISTER_OP_KERNEL( op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest); + paddle::framework::OpMultiKernelTest); REGISTER_OP_KERNEL( op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest2); + paddle::framework::OpMultiKernelTest2); TEST(OperatorRegistrar, OpWithMultiKernel) { paddle::framework::proto::OpDesc op_desc; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2568a459f315187e8ce2eecf3a50a1e121069c0f..cb6b2d832bf047bf71461d3cbf1e6d36800a04e0 100644 --- 
a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -416,13 +416,12 @@ class ExecutionContext { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - const inline platform::CUDADeviceContext& cuda_device_context() const { + const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, platform::errors::PreconditionNotMet( "Current device context place is not GPUPlace.")); - return *reinterpret_cast( - &device_context_); + return *reinterpret_cast(&device_context_); } #endif diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4ad966887f399fe050c7c41c1a78ee19c432590f..26150b2d04b04359c43e9b3d1d0f90faf87ab137 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices( nccl_ctxs->WaitAll(); } else { auto src_place = member_->places_[0]; - auto src_dev_ctx = static_cast( + auto src_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(src_place)); auto sizeof_dtype = framework::SizeOfType(dtype) * numel; for (size_t i = 1; i < member_->places_.size(); ++i) { auto dst_place = member_->places_[i]; - auto dst_dev_ctx = static_cast( + auto dst_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(dst_place)); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); @@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { global_scope, member_->places_); auto &pool = platform::DeviceContextPool::Instance(); for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); + auto *dev_ctx = + static_cast(pool.Get(member_->places_[dev_id])); auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); dev_ctx->set_nccl_comm(nccl_ctx.comm()); } diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 6c8e825157973592918cb2b67df25eea656acad6..050a51a0f1077636abb0267528973643d7a24ff9 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPhiContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dbb549efa2519d48136648ba0cfbba2c4d82cc9a..f7f05da6340135550bae85757983ab3d257b016d 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src, "place is %s, context place is %s.", src_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT @@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src, "destination place is %s, context place is %s.", dst_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && // NOLINT @@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", 
src_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } @@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", dst_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } @@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); @@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); - auto& gpu_dev_ctx = - static_cast(dev_ctx); + auto& gpu_dev_ctx = static_cast(dev_ctx); platform::CPUPlace cpu; uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { @@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(*ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(*ctx).stream()); } #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 79532172571e38f05a3c66444e0fd2119b9416cc..b1bba0f7c35f81b374f84c9fc90d6edef863541a 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -164,13 +164,12 @@ void TensorFromArray(const T* src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -242,13 +241,12 @@ void TensorFromVector(const std::vector& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector& src, } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if 
(platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) @@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 74454a5a09b7af3fb5d9ae0a4f1dc117cf1010ed..36be5cde506f37a6d03b5cbd0467d2b517675adc 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) { // Copy to GPUTensor gpu_tensor.Resize(phi::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); - paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { -{ - paddle::framework::Tensor src; - bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); } +} // namespace framework #ifdef PADDLE_WITH_CUDA { @@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) { }; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) { } } #endif -} +} // namespace paddle TEST(TensorFromDLPack, Tensor) { { @@ -525,7 +523,7 @@ 
TEST(Tensor, FromAndToStream) { Tensor dst_tensor; auto gpu_place = new platform::CUDAPlace(); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b948a191df73aed91be9a355d1cbd32396c25106..c9d3d2591d000c5442504aaaffb66b1b31d50572 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); auto nccl_dtype = platform::ToNCCLDataType(dtype); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); @@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src, int ring_id, bool use_calc_stream) { const auto &place = GetVarPlace(src); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::NCCLComm *comm = platform::NCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b656da34fb6a6909a9f0a6e1f6e0f68eb4736955..e6e156fa61c1435f375841b7d718e16afe4f470a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -122,10 +122,9 @@ class TensorAddFunctor #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void operator()(const platform::CUDAPlace& place) const { - platform::CUDADeviceContext* ctx = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = phi::funcs::GetBlas(*ctx); + phi::GPUContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = phi::funcs::GetBlas(*ctx); blas.AXPY(numel_, 1., x_, y_); } #else @@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::FP16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); @@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - 
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); @@ -614,8 +613,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 99c4a02e82b87fd37bcb2dcff706b5a97f771344..94ac86e97e1573f5f1d9b78d56f40f9f987b18d7 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -85,7 +85,7 @@ void NCCLParallelContext::Init() { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, @@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); @@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ring_id, compute_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ring_id, comm_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { } void NCCLParallelContext::SynchronizeCompute() { - auto *compute_dev_ctx = static_cast( + auto *compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 468263e7be7ea58f0e94ed71c1b2066642e8a538..1c3165a4538a28ca8d8b92469638351378281c8e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ConcatTensorsWithType( - static_cast(context), - dense_tensors_, - &dense_contents_, - dtype_); + ConcatTensorsWithType(static_cast(context), + dense_tensors_, + &dense_contents_, + dtype_); #else 
PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat grad tensors since it's not compiled with NCCL," @@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - SplitTensorsWithType( - static_cast(context), - &dense_contents_, - &dense_tensors_, - dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split grad tensor since it's not compiled with NCCL," diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 5b29e568089ff46efdf25fbff43b86e7b646fa70..a3f840f38bfad1ea4d328cb00be17edfc956a10c 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor, "Unsupport BF16 in DataParallel for now")); } framework::VisitDataTypeForHIP( - dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #else - framework::VisitDataType(dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + framework::VisitDataType( + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #endif } #endif diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index 67059916d03176c01d5f7f68b1c358d2533846f2..597a9a64669909fdae9aefbc7d178afe504e242e 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { void AllReduceByStream(int local_rank, int device_id) { int data_size = 32; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // heter_parallel_ctx imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 89938d2d7a2a2b202df3de4066878577778800b7..13843ddbe5c68bc2cf1e7acb53913b86d187ec4f 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) { int data_size = 4; float test_data = 7; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8971448071f1d2d7521889ca2f30ad9e211d1bcc..bde92c13b4cb206781390382746b2f9722ae4677 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, "Only one choice can be made between CPU and XPU.")); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place)); + auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; memory::Copy(dst_gpu_place, static_cast(input_ptr), @@ -283,7 +282,7 @@ bool AnalysisPredictor::Init( // NOTE: If the 
external_stream equals to global_device_contexts's stream, // then fallback. auto global_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); if (predictor_stream_ != global_stream) { @@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); auto gpu_place = place_; - auto *dev_ctx = static_cast( - pool.Get(gpu_place)); + auto *dev_ctx = static_cast(pool.Get(gpu_place)); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); #else @@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) { auto *pred = dynamic_cast(p->predictor_.get()); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto *dev_ctx = reinterpret_cast( - pool.Get(pred->place_)); + auto *dev_ctx = reinterpret_cast(pool.Get(pred->place_)); cudaStreamSynchronize(dev_ctx->stream()); #endif } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 34dade3628a468c5cad6fa8df478c7bc68d7050e..2ba806a052977553f7d99eb90a8af69999057eb9 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place_)); + auto *dev_ctx = static_cast(pool.Get(place_)); auto dst_gpu_place = place_; memory::Copy(dst_gpu_place, static_cast(input_ptr), diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 171db0807e7de0ade3bd5303dd44ec82d4841535..51b27f8ca3ac07430cb0446ada11519f0ae08eff 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst, paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); paddle::platform::CUDAPlace gpu_place(dst.device_); - auto* dev_ctx = static_cast( - pool.Get(gpu_place)); + auto* dev_ctx = static_cast(pool.Get(gpu_place)); if (src.place() == PlaceType::kCPU) { paddle::memory::Copy(gpu_place, diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index e9ffbbd4494f6f42a3046ccaa4810e2d5f22bb5a..454cd49d3ab0489a66513ddb9f7bec27f9bd9adc 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place, } else if (platform::is_gpu_place(dst_place) && platform::is_gpu_place(src_place)) { auto gpu_place = src_place; - memory::Copy( - gpu_place, - dst_data, - gpu_place, - src_data, - size, - static_cast(ctx).stream()); + memory::Copy(gpu_place, + dst_data, + gpu_place, + src_data, + size, + static_cast(ctx).stream()); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 45b9d222c4c3ec62bed2928dc5916689dc3d1528..e4054c5df675487a8b37359251dc8ad9309e37ac 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) { 
framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 43e1d8770c37c3429646747b4c5721214a5a4440..eea51e8ff1e130ae18be72d0f70d131408537cca 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - platform::GpuStreamSync( - static_cast(ctx).stream()); + platform::GpuStreamSync(static_cast(ctx).stream()); } #endif std::vector result; diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index d770ef5478abb5d17d2975c71ea52e26d601f6ac..06555114164c0111994880f427afb09e505ba520 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) { TEST(EngineIOConverterTester, DefaultGPU) { platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); IOConverterTester(ctx); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 2cdf362330625cba5826c741d76c85c4d2443de4..9b80aeb1d493887a4326a9025fd8c4a72b26abfe 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -124,7 +124,7 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); @@ -172,7 +172,7 @@ class TRTConvertValidation { "But received batch_size:%d, max_batch_size_:%d", batch_size, max_batch_size_)); - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); op_->Run(scope_, place_); cudaStreamSynchronize(stream_); std::vector input_output_names; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 499c21723fe1a9bdc7ebf75528c758a83afdf1df..9602e6c87903a68e65b2ec81513d10275f827573 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto *device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, @@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto 
*device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); @@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue( apply_scale<<>>( tptr, static_cast(scale_), n_q); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 97b97aa3a4b85a948b10c2464c8ec244b7c24d77..6ac23e32856becf88e1a238d255ad2bdee272316 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -34,7 +34,7 @@ namespace tensorrt { class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicEngineTest, test_spmm) { @@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { std::vector inputs_; std::vector outputs_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 2836295f006f19a40a0002e321095fc1e00883a2..dc8065ab2a628e6525b96984fafad1c314e69ec2 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -26,7 +26,7 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 57782494eaf8cff502fafdbfcfee1fa4b66d4a0d..44bcc10abae1a8f788b22445b025dffd3a03d9d6 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) { std::unique_ptr(new 
BestFitAllocator(cuda_allocation.get()))); platform::CUDAPlace gpu(0); - platform::CUDADeviceContext dev_ctx(gpu); + phi::GPUContext dev_ctx(gpu); dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu, dev_ctx.stream()) .get()); @@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) { size_t* data = reinterpret_cast(allocation->ptr()); ForEachFill fill(data); - platform::ForRange for_range(dev_ctx, - allocate_size); + platform::ForRange for_range(dev_ctx, allocate_size); for_range(fill); memory::Copy(platform::CPUPlace(), diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 662bcc401bd743f2dce0ef2345992c623c135f46..f7e74e0421281300797411354cca49901b94c406 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -29,53 +29,51 @@ namespace memory { namespace allocation { /** - * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation. - * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath - * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream + * GPUContextAllocation is a wrapper of the underbeneath allocation. + * GPUContextAllocation adds a CUDA stream callback for the underbeneath + * allocation so that GPUContextAllocation can be used in a CUDA stream * which deletes allocation in the callback. */ -class CUDADeviceContextAllocation : public Allocation { +class GPUContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) + explicit GPUContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} - ~CUDADeviceContextAllocation() { + ~GPUContextAllocation() { PADDLE_ENFORCE_NOT_NULL( dev_ctx_, platform::errors::PreconditionNotMet( - "Device context is not set for CUDADeviceContextAllocation")); + "Device context is not set for GPUContextAllocation")); auto *p_allocation = underlying_allocation_.release(); - VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " + VLOG(4) << "Adding callback to delete GPUContextAllocation at " << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { - VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; + VLOG(4) << "Delete GPUContextAllocation at " << p_allocation; Allocator::AllocationDeleter(p_allocation); }); } - void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { - dev_ctx_ = dev_ctx; - } + void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; } private: DecoratedAllocationPtr underlying_allocation_; - const platform::CUDADeviceContext *dev_ctx_{nullptr}; + const phi::GPUContext *dev_ctx_{nullptr}; }; /** - * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation + * GPUContextAllocator will allocate a GPUContextAllocation * after waiting for a self-created event on the default stream. 
It does so to * let the non-default stream be able to allocate GPU memory which will be * released by stream callback */ -class CUDADeviceContextAllocator : public Allocator { +class GPUContextAllocator : public Allocator { public: - explicit CUDADeviceContextAllocator(platform::CUDAPlace place, - gpuStream_t default_stream) + explicit GPUContextAllocator(platform::CUDAPlace place, + gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator { #endif } - ~CUDADeviceContextAllocator() { + ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( - "Default stream is not set for CUDADeviceContextAllocator")); + "Default stream is not set for GPUContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = new CUDADeviceContextAllocation( + auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP @@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator { }; /** - * CUDADeviceContextAllocatorPool is a singletion stores mapping from - * CUDAPlace(s) to std::shared_ptr. When a - * CUDADeviceContext's compute stream isn't default stream, it can call this + * GPUContextAllocatorPool is a singletion stores mapping from + * CUDAPlace(s) to std::shared_ptr. When a + * phi::GPUContext's compute stream isn't default stream, it can call this * class to allocate GPU memory which will be released by a callback after * stream execution. 
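 *
 * A minimal usage sketch (illustrative only, not part of this patch; it
 * assumes a phi::GPUContext `ctx` whose compute stream is not the default
 * stream, and `nbytes` is a hypothetical allocation size):
 *
 *   auto &pool = paddle::memory::allocation::GPUContextAllocatorPool::Instance();
 *   auto buf = pool.Alloc(ctx, nbytes);  // owns a GPUContextAllocation wrapper
 *   // ... enqueue work on ctx.stream() that reads or writes buf->ptr() ...
 *   buf.reset();  // memory is released by a callback added to ctx's stream,
 *                 // i.e. only after the work already enqueued there finishes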
*/ -class CUDADeviceContextAllocatorPool { +class GPUContextAllocatorPool { public: - static CUDADeviceContextAllocatorPool &Instance() { - static CUDADeviceContextAllocatorPool pool; + static GPUContextAllocatorPool &Instance() { + static GPUContextAllocatorPool pool; return pool; } - AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { + AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) { auto iter = allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); PADDLE_ENFORCE_NE( @@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool { platform::errors::NotFound("No allocator found for CUDAPlace.")); auto &allocator = iter->second; AllocationPtr allocation = allocator->Allocate(size); - static_cast(allocation.get()) - ->SetCUDADeviceContext(&dev_ctx); + static_cast(allocation.get()) + ->SetGPUContext(&dev_ctx); return allocation; } private: - CUDADeviceContextAllocatorPool() { + GPUContextAllocatorPool() { std::vector devices = platform::GetSelectedDevices(); for (int i : devices) { auto place = platform::CUDAPlace(i); auto compute_stream = platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); - auto allocator = std::shared_ptr( - new CUDADeviceContextAllocator(place, compute_stream)); + auto allocator = std::shared_ptr( + new GPUContextAllocator(place, compute_stream)); allocators_.insert(make_pair(place, allocator)); } } - std::map> + std::map> allocators_; }; diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 05e712e72f27abfe470d6547366bd72d1fcfccb6..b3308ffdd3046d9cb87a90e2abc51b6dd24626a2 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -37,7 +37,7 @@ const int NUM_STREAMS = 8; const int N = 2; const float DELTA = 1e-1; -using CudaDevCtxVec = std::vector>; +using CudaDevCtxVec = std::vector>; __global__ void kernel(float *x, int n) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) { void MultiStreamCompute(float **data, float **second_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // multi-streams AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); @@ -88,7 +88,7 @@ void MultiStreamCompute(float **data, #endif } -TEST(Malloc, CUDADeviceContextMultiStream) { +TEST(Malloc, GPUContextMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); @@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { } } -TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { +TEST(Malloc, GPUContextMultiThreadMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); diff --git 
a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 96831b6bafc7930ae4415c7b5f09aae7c910e293..67f2df8cda5aaaa8d682211ebedca920a66da8c1 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { allocation_implicit_stream.reset(); gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = @@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { size_t alloc_size = 256; gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); std::shared_ptr allocation_implicit_stream = diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index fae675142bc0f9afae43306b9398c076bb4a2bab..49f78715c2cf59ce1dbaddf0b19f21b18fdf5ed5 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { using framework::Tensor; +using phi::GPUContext; using platform::ActivationDescriptor; -using platform::CUDADeviceContext; using platform::TensorDescriptor; #ifdef PADDLE_WITH_HIP @@ -39,12 +39,12 @@ template struct CudnnActivationFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -77,7 +77,7 @@ struct CudnnActivationFunctor { out->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -90,12 +90,12 @@ template struct CudnnActivationGradFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor { dx->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor { template struct CudnnReluFunctor : public CudnnActivationFunctor { - explicit CudnnReluFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} }; template struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -167,12 
+167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { template struct CudnnRelu6Functor : public CudnnActivationFunctor { - explicit CudnnRelu6Functor(const CUDADeviceContext& ctx) + explicit CudnnRelu6Functor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} }; template struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { - explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) + explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor( ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} @@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { template struct CudnnSigmoidFunctor : public CudnnActivationFunctor { - explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} }; template struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { template struct CudnnTanhFunctor : public CudnnActivationFunctor { - explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} }; template struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -221,7 +221,7 @@ class CudnnActivationKernel framework::Tensor* Out = nullptr; ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out); } @@ -242,7 +242,7 @@ class CudnnActivationGradKernel ExtractActivationGradTensor( context, &X, &Out, &dOut, &dX); dX->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"), GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"), diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 9aafb70c7dce576374cc90fe7abdc5ef2aff7eac..76a05aa37a646c12f7c7149fa05d0d74af866254 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor = namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - 
ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); -#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6_grad, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index f638f6943ffb89b62d84feb7c5b7fefa15938bc0..8fcdb323884183a00632976e6f32705f566a8242 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(affine_channel, ops::AffineChannelCUDAKernel, diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index a5b3f9fcfda885c7cef64b0de3241ccd7712e764..48832ac1d6dadf274e5389b953de1486669c0871 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support for CUDAPlace.Please switch your context from " "CPUPlace to CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* theta = ctx.Input("Theta"); auto* output = ctx.Output("Output"); @@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { "support for CUDAPlace. 
Please switch " "your context from CPUPlace to " "CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto theta_grad = ctx.Output(framework::GradVarName("Theta")); diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index b1ed3835e75249000399175cc699a194408d3a46..a5d4c6484a1f9b1bd01a99568a59edcdad40d651 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { } template -struct Linspace { +struct Linspace { void operator()(T start, T end, int count, @@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel { w = size_attr[3]; } T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( ctx.cuda_device_context(), theta_grad, static_cast(0)); T h_step; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 0338fb5d2f2a5917f2d6195820016e1c977ed422..35b667825afb8eaafe097b5ac88826f61a2df5e3 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); auto outs = ctx.MultiOutput("Out"); @@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); framework::Tensor inverse_scale = - ctx.AllocateTmpTensor({1}, - dev_ctx); + ctx.AllocateTmpTensor({1}, dev_ctx); MPDType* inverse_scale_v = inverse_scale.template data(); InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index d76dd13e5bcde7dc0134d9e7075bd7e249c8634a..4c927066892995b20fcaa79e18f79c1a1f301f06 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, @@ -134,9 +132,9 @@ class UpdateLossScalingFunctor -class LazyZeros { +class LazyZeros { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { @@ -204,7 +202,7 @@ class LazyZeros { namespace ops = paddle::operators; namespace plat = paddle::platform; -using GPU = paddle::platform::CUDADeviceContext; +using GPU = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(update_loss_scaling, ops::UpdateLossScalingKernel, diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc 
b/paddle/fluid/operators/array_to_lod_tensor_op.cc index a2af64e2276801a05c114c7b7c6477949d781df8..5fee66d968b73614c44de052b51c3089f0797a35 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function { Apply(static_cast(pool.Get(place))); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(pool.Get(place))); + Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 164744527e249b0ba0341cfffd02ccaa73d30829..3f36e8b13476d2f7f56f970b3e2456553d1f7310 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel { *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len); cpu_eff_num_len_data = cpu_eff_num_len.data()[0]; } - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); auto out_data = out->mutable_data(out_dims, place); diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index e97f12618454b5544909c4e0c236e9fd9ac401eb..362489e51acc231356dce927c0ba8d142e6b5cf4 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel { T* out_data = output->mutable_data(ctx.GetPlace()); // initialize auto out_eigen = framework::EigenVector::Flatten(*output); - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); out_eigen.device(place) = out_eigen.constant(static_cast(0)); CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel { int64_t strideA = ins_num * in_dim; int64_t strideB = in_dim * out_dim; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, ins_num, @@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { auto in_dim = input_dims[2]; auto out_dim = w_dims[2]; - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); // initialize dx->mutable_data(ctx.GetPlace()); auto dx_eigen = framework::EigenVector::Flatten(*dx); @@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { out_dim, db_data); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(batch_fc, ops::BatchFCCUDAKernel, ops::BatchFCCUDAKernel); diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc index 
15aca070221b057b66733dab6407a9ff575447cd..93f538e67890674c6bb502ab3c3aa4d8b4fa9555 100644 --- a/paddle/fluid/operators/beam_search_op.cu.cc +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -17,9 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); +REGISTER_OP_CUDA_KERNEL(beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index a4fa631f741f28c433caa6f79349e4b8103bce71..7afb3f1135d3458b2b8d23801ed0b7bb3c46eda0 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_OP_CUDA_KERNEL(transfer_dtype, ops::CastOpKernel, diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index bec1bb662de88b59df922b64c9efba401f62e6db..2548b13559133d19eb99b9236ebe0e907e318376 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(center_loss, ops::CenterLossCUDAKernel, ops::CenterLossCUDAKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc index afa350ef116c4bf4880d293df2dcdd7d73c199a9..ae9dd3401fd2164681ed6870aa25e95ac17b9e52 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -17,8 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( cinn_instruction_run, - ops::CinnInstructionRunOpKernel); + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 64980dfb013b487ab40fc3bd9234fee6b64a296f..7dbf2fee0c2bbbb682266dd0084b45a5bc0fd337 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -18,6 +18,4 @@ limitations under the License. 
*/ /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( - cinn_launch, - paddle::operators::CinnLaunchOpKernel); + cinn_launch, paddle::operators::CinnLaunchOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc index 26fee2d9e577c12e99534f249b1455b3ce9eee40..48efa5c51168e7975496605b0350c9b20200a556 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.cc +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -21,10 +21,8 @@ namespace paddle::operators::details { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); +void* GetStream(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = ctx.template device_context(); return dev_ctx.stream(); } #endif diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h index 55ee3789c0a825d62eea42cf3b98f9a56ca76762..4387095fefaf6117db037870d4b57fe19b6a8864 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.h +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx); +void* GetStream(const framework::ExecutionContext& ctx); #endif } // namespace details diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index a0642694843e8afe73a9947876ee93648d402449..b92062b1aee244d8812e433a052be261da3c8ebf 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( class_center_sample, - ops::ClassCenterSampleCUDAKernel, - ops::ClassCenterSampleCUDAKernel); + ops::ClassCenterSampleCUDAKernel, + ops::ClassCenterSampleCUDAKernel); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 561d2696fef8500c2125befb410f86f51a42e209..4a11e6d5723bd9e0e20dfc59d67d12ab17930934 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif #if defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index af299fc6b5af73ceffd39bcd1b0d5bd88a9534df..174a5afa69dc60c373f65f00796a8398ed42d7e9 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -17,10 +17,9 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - allreduce, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_CUDA_KERNEL(allreduce, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 12708ab666db6cf2f4ce695e5a7aa31c9a994f82..12507d76fe73a17188fc2c27a9750865b28eca85 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto in = ctx.Input("X"); auto out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index ef59772b173e0f1ce1f4875ebe4da4db03bd9509..718f60c7737c1bc90ad666701ba635bce94433e2 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index c59d8315a369e29d1bb4f5f0b8d11366e1a3fa9e..de15395eb4df5504408f23a1344b0b04898ac700 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto stream = static_cast(dev_ctx)->stream(); + auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index ceac881bff19f573b9d1bd159cfb9407e1473597..4f21dc2992a39f3dd838be13fab8d5a9c2ac98a8 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index e9228a28dbac05131dd1ad0138a0b556e1a2f3dc..8356bbb65a8a76c912d3e83eebde5fe58195402f 100644 --- 
a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index e679fb2fe9c1365df2ee8eff09dff1cca12245fe..718c77aaa6ff20f5e775bd810a73fc681d06ffb0 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 33e320816de7b80b124a1917c6248eed2448915d..e43c67d7bf369b10e27cdbd88de59a8b8f391b2a 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index 3fb2047dc27840d8c7aa4cc0e147c74b496d44d1..74bdd2b63ae5732900bcf51bf240e23ce6a4f5e6 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { T* recv_buff = temp_out.data(); gpuStream_t stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclAllGather(send_buff, @@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel { offset += rows_per_tensor; } - math::ConcatFunctor functor; + math::ConcatFunctor functor; out->mutable_data(out_dims, place); - auto& dev_ctx2 = ctx.template device_context(); + auto& dev_ctx2 = ctx.template device_context(); functor(dev_ctx2, inputs, axis, out); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 3f14c0ac9c1a79eab16e98ceeb619dd277e7e214..53aef8e8357343a1a9891643d2aa6b2c482dddd0 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel { auto *ids_t = context.Input("Ids"); auto *output_t = context.Output("Out"); - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); size_t N = 
table_t->dims()[0]; size_t D = table_t->dims()[1]; @@ -142,8 +141,7 @@ template class CEmbeddingGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); auto ids_t = context.Input("Ids"); auto d_output_t = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index f2e6cdbe2ca665ad9fbe7135a94bcfac0838fe2f..dae4fa497f7fb92f01aee2bf598049f746abade9 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 33617d8787d31222872a2ac54d5c0f7acf9e779d..354c31c213b6374990808cab4936d7f22862e325 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index b7e6262b81e197c81310e235d89e14329ff268dd..42d9ed2342ca01d1e60a0a9bc995819d4249b39f 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index de83bc773baa59a8ab0f69644051011dfdb8a50e..ef7e298aaf6a3cbe1f9b9d1464a3be882ac0003f 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor { const auto& place = ctx.GetPlace(); const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); // use global calculate stream - const auto stream = static_cast( + const auto stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, 
dev_ctx); void* logits_max_buff = logits_max.mutable_data(place); auto eigen_logits_max = math::EigenMatrix::From(logits_max); @@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); @@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const int rank = ctx.Attr("rank"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto map = distributed::ProcessGroupMapFromGid::getInstance(); distributed::ProcessGroup* pg = map->get(rid); @@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); auto eigen_logits_max = math::EigenMatrix::From(logits_max); Eigen::DSizes along_axis(1); @@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); @@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); const int rank = context.Attr("rank"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); if (logit_grad != softmax) { framework::TensorCopy( diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 06c251e32cfc223ef92d32796d3eca8e27a3a3fe..5b34e4ba9d59466a7f89cbc534957d083d36b25e 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -83,7 +83,7 @@ class CSplitOpCUDAKernel : public framework::OpKernel { rank, nranks)); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto dims = x->dims(); auto dims_size = dims.size(); // final dim diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index f800be642f7213d608ad7ba836cf0b86debe472b..5b26e47a8fdc7abd1e045b129ab5b2c103014592 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ class 
CSyncCalcStreamKernel : public framework::OpKernel { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( + auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::GpuStreamSync(dev_ctx->stream()); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index 4a60f255b47d3dd70a75d8e77d7bba7f3d6f87e5..bacbe014a343cedf2901975208eefdbf00466d28 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index cec57c6bfd77ed313fd8ad8cbaaf4557f7201534..34569b0a4b600349821091741e315958e85afc8b 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 2b8ba4049c545a5d7568f84cd1be39e3a5389375..3d7ab09f45e7d78f441ebeb1157a6ccb9d2ce686 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -83,7 +83,7 @@ struct GlobalGatherFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index b8b260c74cecbbf5cb8c8aa38e8508dfb49c6375..1337901f185af48d33ddbf849bc428f5d109f48c 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -82,7 +82,7 @@ struct GlobalScatterFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 7e25f6876adb0f42e92af837cb1f9cccec2bf67e..6bc18254737d321cb129a8b0e585e70f31bd5a3b 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -81,7 +81,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc 
b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index da6690a96a19a1a8836dad27350440357fff8c08..526f942599210a686e4196cb5f307392d34c3301 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -82,7 +82,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 874bd61d198971f7a9d8d44d4984037c247af166..84b1e7148df0216eaf69721898a2e2a16cfbfcba 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -78,7 +78,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 944644f4101b9f4f31acfaad9ae0d46397179166..ec18a172e1f8bdff6a63ad10298c9baeed5ccfce 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -158,7 +158,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 063eb5c1f82b22be1b0022a29ce692eefb9396af..37b18703031de36ae0318a1f66fac79ffe6f579b 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -153,7 +153,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 2e65e9f352dc3285f22fd07f88f32ac771dfeccc..89b703d8d1a5d801962424b87081b52f72820c21 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -124,8 +124,7 @@ __global__ void ConvShiftDy(const T *x, } // namespace template -class ConvShiftKernel - : public framework::OpKernel { +class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -146,8 +145,7 @@ class ConvShiftKernel dim3 grid_dim(num_x_blocks, batch_size); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); 
ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -155,8 +153,7 @@ class ConvShiftKernel }; template -class ConvShiftGradKernel - : public framework::OpKernel { +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -174,9 +171,8 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = - context.template device_context(); - phi::funcs::SetConstant zero; + auto &device_ctx = context.template device_context(); + phi::funcs::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -212,9 +208,7 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_CUDA_KERNEL( - conv_shift_grad, - ops::ConvShiftGradKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 746debe21e58ff071dc7f94e06745cf6b7921037..3205d5b353887c21f6029c1b4e5dc1c55437cd53 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -44,7 +44,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input"); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; if (use_cudnn) { library_ = framework::LibraryType::kCUDNN; @@ -348,7 +348,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif @@ -435,7 +435,7 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index e753b148fe1a6da652f8123659b1b3e923c8fc1a..3172625681a236d00ea903e3393a47c6e29f08b4 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -132,7 +132,7 @@ void Compare2(f::Scope* scope, #ifdef PADDLE_WITH_CUDA TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); @@ -142,7 +142,7 @@ TEST(copy_cross_scope, CUDA_fp32) { TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext 
ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index d632de3ac86c20d715af98cb5cde1a5135d3306c..434506c033c4dc660885d41e2d8b5da1116c245a 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -176,7 +176,7 @@ __global__ void correlation_forward(T *output, } } -// class CorrelationKernel +// class CorrelationKernel template class CorrelationCUDAKernel : public framework::OpKernel { public: @@ -197,7 +197,7 @@ class CorrelationCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // base on input1, NCHW auto in_dims = input1->dims(); @@ -209,11 +209,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); @@ -468,7 +468,7 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { grad_input1->mutable_data(ctx.GetPlace()); auto *grad_input2 = ctx.Output(framework::GradVarName("Input2")); grad_input2->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto in_dims = input1->dims(); int N = in_dims[0]; @@ -479,11 +479,11 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu index 3d144ca29d9989ad2cbb438a950860eaac873d07..5599a9b19b014721bfe7a0aa10696be1d15e3a43 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -14,8 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cos_sim, ops::CosSimKernel); -REGISTER_OP_CUDA_KERNEL( - cos_sim_grad, - ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index bdc1f61fbe0ebcfd6b0116dab80172c00a2b9ec6..41e9d673d3fe2784cdff4e15735b8c72f3fa0bb2 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -230,11 +230,9 @@ REGISTER_OP_CPU_KERNEL(crop_grad, ops::CropGradKernel, ops::CropGradKernel); -REGISTER_OP_CUDA_KERNEL( - crop, - ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, - ops::CropGradKernel, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, + ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CUDA_KERNEL(crop_grad, + ops::CropGradKernel, + ops::CropGradKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index cabe21919a95f605dbd9752be973c4abcbe88af7..2557532a940f48210d8ced3fab92afe004ca3c30 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -17,7 +17,7 @@ limitations under the License. */ namespace plat = paddle::platform; namespace ops = paddle::operators; -using CUDACtx = paddle::platform::CUDADeviceContext; +using CUDACtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, ops::CrossEntropyOpKernel, diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 25d2c4e77d1ff13fc87f0e4559458c5400200832..2095b3d3858e34dbc97d69910975ec615fc625d7 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -154,10 +154,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { if (host_out_lod0.back() == 0) { output->Resize({1, 1}); output->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), - output, - -1); + phi::funcs::SetConstant set_constant; + set_constant( + ctx.template device_context(), output, -1); } } } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index bf3009e1fe23463b1de91edb8fa393c624e040e1..d53333d2176039043f2f55c4134989387ce582ab 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -247,7 +247,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { SequenceLength = operators::GetDataFromTensor(sequence_length); } - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); int seq_length = x->dims()[0]; @@ -262,9 +262,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int weight_numel; bool w_initialized = false; auto place = ctx.GetPlace(); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); if (is_test && ctx.HasInput("W")) { auto *W = ctx.Input("W"); w_initialized = W->IsInitialized() ? 
true : false; @@ -460,7 +460,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto weight_grad_list = ctx.MultiOutput( framework::GradVarName("WeightList")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto input_dims = input->dims(); @@ -479,9 +479,9 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { bool continuous = is_continuous>(weight_list); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); Tensor weight_whole; T *weight_data = nullptr; @@ -494,7 +494,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } Tensor weight_grad; - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index a5c3b51d300be0ac4f387c7bc995d961e2b11a81..d08d9e14ef06e91824576b9b7423bef932affe40 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -99,8 +99,7 @@ class CVMCUDAKernel : public framework::OpKernel { T* y_data = y->mutable_data(context.GetPlace()); // for Input X do not have Lod Information. - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (x->NumLevels() == 0) { CvmComputeKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -147,8 +146,7 @@ class CVMGradCUDAKernel : public framework::OpKernel { auto item_size = dx_numel / batch_size; // for Input X do not have Lod Information. 
- auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (dx->NumLevels() == 0) { CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index f87c88b2aaf2a5a6a446d75682d752f2d4f501a9..e3f510e755b9ce9d55eb2667efb6bf1e03081484 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -104,8 +104,7 @@ __global__ void KernelUpdateParam(int C, } template -class DataNormKernel - : public framework::OpKernel { +class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -130,8 +129,7 @@ class DataNormKernel T *scale_out_data = ctx.Output("Scales")->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); KernelMeanScale<<>>( C, @@ -146,8 +144,7 @@ class DataNormKernel }; template -class DataNormGradKernel - : public framework::OpKernel { +class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -180,8 +177,7 @@ class DataNormGradKernel ctx.Output(framework::GradVarName("BatchSquareSum")) ->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); if (d_x != nullptr) { KernelDataNormBP<< } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - data_norm, - ops::DataNormKernel, - ops::DataNormKernel); -REGISTER_OP_CUDA_KERNEL( - data_norm_grad, - ops::DataNormGradKernel, - ops::DataNormGradKernel); +REGISTER_OP_CUDA_KERNEL(data_norm, + ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CUDA_KERNEL(data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 18a38a0471d06cbc0ac7a846200ab95dce213137..d974a60197d683ed85c66105814f656fed99c5ce 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -624,7 +624,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling, ops::DeformablePSROIPoolCUDAKernel, ops::DeformablePSROIPoolCUDAKernel); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cu b/paddle/fluid/operators/dequantize_abs_max_op.cu index 964f740a03fe759c9662594b49368afd7820ce01..57d2c02adb0950cf0c6c08447dabb0885e25a6f8 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cu +++ b/paddle/fluid/operators/dequantize_abs_max_op.cu @@ -27,8 +27,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, float max_range, @@ -46,14 +46,14 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; 
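The dequantize_abs_max hunk directly above is hard to read here because the template argument lists were stripped when the diff was rendered; the underlying change only swaps the device-context parameter of the functor, and of its explicit instantiations, from platform::CUDADeviceContext to phi::GPUContext. A minimal sketch of that shape follows; the forward declarations and the float/double element types are illustrative stand-ins, not the file's exact instantiation list.

```cpp
// Illustrative sketch only: the real definitions live in the Paddle headers;
// Tensor and GPUContext are forward-declared here just so the shape compiles.
namespace paddle {
namespace framework {
class Tensor;
}  // namespace framework
}  // namespace paddle

namespace phi {
class GPUContext;
}  // namespace phi

namespace paddle {
namespace operators {

// Primary template, parameterized on the device-context type.
template <typename DeviceContext, typename T>
struct DequantizeFunctor;

// After this change the GPU specialization is keyed on phi::GPUContext
// rather than platform::CUDADeviceContext; the body itself is unchanged.
template <typename T>
struct DequantizeFunctor<phi::GPUContext, T> {
  void operator()(const phi::GPUContext& dev_ctx,
                  const framework::Tensor* in,
                  const framework::Tensor* scale,
                  float max_range,
                  framework::Tensor* out);  // definition elided in this sketch
};

// Explicit instantiations are updated the same way; these element types are
// placeholders, the actual list is the one elided in the hunk above.
template struct DequantizeFunctor<phi::GPUContext, float>;
template struct DequantizeFunctor<phi::GPUContext, double>;

}  // namespace operators
}  // namespace paddle
```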
} // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_abs_max, ops::DequantizeMaxAbsKernel, ops::DequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index dc4e03a858f4b58c6866fbd22c72eec21b48fe4c..2c47d9b17aa068cfa0b879812c2fb894c8e7da54 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -36,8 +36,8 @@ __global__ void KeDequantize(const T* in, } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* dict, framework::Tensor* out) { @@ -54,11 +54,11 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_log, ops::DequantizeLogKernel); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 82678d456c3d66624f03a86a4d6edcf6db423cca..4c729a65f59ed7c67fbca04f0fd26ecf30f1d93b 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -39,8 +39,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy( gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream()); #else @@ -66,8 +65,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index 036a33cff8e52583eca40f34fe13c2f857205060..30250eb8cc0481f5bec3a18ac501b6db091b59e5 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -96,8 +96,7 @@ class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); anchors->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index e41f4e9b3b7e8504cac4585f38f48b49917b7b70..90be767e2f2134d0985e43bf2ea5462a455ba4da 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -47,14 +47,14 @@ struct RangeInitFunctor { }; template -static void SortDescending(const platform::CUDADeviceContext &ctx, +static void SortDescending(const phi::GPUContext &ctx, const Tensor &value, Tensor *value_out, Tensor *index_out) { int num = static_cast(value.numel()); Tensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - platform::ForRange for_range(ctx, num); + 
platform::ForRange for_range(ctx, num); for_range(RangeInitFunctor{0, 1, idx_in}); int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); @@ -287,7 +287,7 @@ static __global__ void NMSKernel(const int n_boxes, } template -static void NMS(const platform::CUDADeviceContext &ctx, +static void NMS(const phi::GPUContext &ctx, const Tensor &proposals, const Tensor &sorted_indices, const T nms_threshold, diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 084faf32e6b4b564e34f5c3a31fc70ae7a542547..87dc4a30abb318d22d08f76ace36bcdc94aa37fa 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -75,7 +75,6 @@ class GPUBoxClipKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - box_clip, - ops::GPUBoxClipKernel, - ops::GPUBoxClipKernel); +REGISTER_OP_CUDA_KERNEL(box_clip, + ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 7f66cb86b569345ad71624b4a6f4c0e4d301b63c..f87a636bdfb02cb0a4b0de7ca95666a5678c5010 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -152,7 +152,5 @@ class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( box_decoder_and_assign, - ops::BoxDecoderAndAssignCUDAKernel, - ops::BoxDecoderAndAssignCUDAKernel); + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 01346c94fa66aed7f293852b771c9afb53fa0934..0fbc54d3135d682b0d30cb3540d54e38c3fa7345 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -138,8 +138,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_total( - dev_ctx, total_roi_num); + platform::ForRange for_range_total(dev_ctx, total_roi_num); for_range_total(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -188,8 +187,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor batch_index_t; int* batch_idx_in = batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_post( - dev_ctx, real_post_num); + platform::ForRange for_range_post(dev_ctx, real_post_num); for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); Tensor out_id_t; @@ -228,7 +226,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &length_lod, static_cast(0)); int blocks = NumBlocks(real_post_num); @@ -274,7 +272,5 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( collect_fpn_proposals, - ops::GPUCollectFpnProposalsOpKernel, - ops::GPUCollectFpnProposalsOpKernel); + ops::GPUCollectFpnProposalsOpKernel, + ops::GPUCollectFpnProposalsOpKernel); diff --git 
a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 8521b28127bc258d77d68577ee80f61ffbec7e50..aa60d054546cd80358fc482d642e4ac215208f42 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -164,8 +164,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); GenDensityPriorBox<<>>(feature_height, feature_width, img_height, diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 3fd309aee402a98fbf95e1d26be1da2c6011f113..1063382ef338a4f0ac7c060ccecee4a153a9fbf2 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -129,7 +129,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor sub_lod_list; sub_lod_list.Resize({num_level, lod_size}); int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &sub_lod_list, static_cast(0)); Tensor target_lvls; @@ -155,7 +155,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range(dev_ctx, roi_num); + platform::ForRange for_range(dev_ctx, roi_num); for_range(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -258,7 +258,5 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distribute_fpn_proposals, - ops::GPUDistributeFpnProposalsOpKernel, - ops::GPUDistributeFpnProposalsOpKernel); + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 00ffeebc08b185c9f115d0d04d3a0d12c7785361..ed1ad6da34d4add9b8a5ae157e3091cdde9b123a 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_info, const Tensor &anchors, const Tensor &variances, @@ -59,7 +59,7 @@ static std::pair ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -94,7 +94,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -266,5 +266,4 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { namespace ops = paddle::operators; 
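The ForRange hunks in bbox_util.cu.h, collect_fpn_proposals, distribute_fpn_proposals and generate_proposals above all follow one pattern: only the device-context template argument of the helper changes, which the stripped rendering hides. A small fragment with the arguments restored for readability; it assumes the surrounding variables (dev_ctx, num, idx_in) and headers from those hunks.

```cpp
// Before: platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, num);
// After: the same helper, instantiated for phi::GPUContext instead.
platform::ForRange<phi::GPUContext> for_range(dev_ctx, num);
for_range(RangeInitFunctor{0, 1, idx_in});  // writes 0, 1, 2, ... into idx_in
```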
REGISTER_OP_CUDA_KERNEL( generate_proposals, - ops::CUDAGenerateProposalsKernel); + ops::CUDAGenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index d5005f435f25abecea38487d321102dd64d40a62..682a9adf659525a61159254bc496e1ce177e8076 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_shape, const Tensor &anchors, const Tensor &variances, @@ -60,7 +60,7 @@ static std::pair ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -98,7 +98,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -274,5 +274,4 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( generate_proposals_v2, - ops::CUDAGenerateProposalsV2Kernel); + ops::CUDAGenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu index 8342b4138c87e6ea1803146bac6d6954a569ef5f..dc27f32653888c7e995ba38719d384e0b769424a 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cu +++ b/paddle/fluid/operators/detection/iou_similarity_op.cu @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/iou_similarity_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - iou_similarity, - ops::IOUSimilarityKernel, - ops::IOUSimilarityKernel); +REGISTER_OP_CUDA_KERNEL(iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu index 1808806714774f7444c687ef08d9ee7f0e15cc2e..1cdf7691338294a2787f4aae11bc6bf2178f4e20 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -149,8 +149,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 5bf68c154c66dfb9fc19e30f8b9f892a7e6e4b97..73b28f8f0e4761ae2c27326a454d08b2f29b3ed8 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -382,7 +382,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { T* out2in_w_data = out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - phi::funcs::SetConstant init; + phi::funcs::SetConstant init; init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); auto transformed_height = ctx.Attr("transformed_height"); @@ -519,7 +519,7 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 9ca480ba72760cddc278f03f2f335b5d23587487..3def90fd459e5bccefeb28873402db8d838f28e1 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -185,12 +185,9 @@ class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss, - ops::GPUSigmoidFocalLossKernel, - ops::GPUSigmoidFocalLossKernel); + ops::GPUSigmoidFocalLossKernel, + ops::GPUSigmoidFocalLossKernel); REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss_grad, - ops::GPUSigmoidFocalLossGradKernel, - ops::GPUSigmoidFocalLossGradKernel); + ops::GPUSigmoidFocalLossGradKernel, + ops::GPUSigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu index 1ab698998c7290dd2f4b2071b1f56e06c5cf4632..337f55a3ca81fa7a680efb0b708a4790ffb5ad26 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cu +++ b/paddle/fluid/operators/detection/target_assign_op.cu @@ -41,8 +41,8 @@ __global__ void NegTargetAssignKernel(const int* neg_indices, } template -struct NegTargetAssignFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct NegTargetAssignFunctor { + void operator()(const phi::GPUContext& ctx, const int* neg_indices, const size_t* lod, const int N, @@ -58,16 +58,13 @@ 
struct NegTargetAssignFunctor { } }; -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - target_assign, - ops::TargetAssignKernel, - ops::TargetAssignKernel); +REGISTER_OP_CUDA_KERNEL(target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu index e7f564b7ab4d1c11810dc096faec7f5a375b8563..9926d0e54369bfa4aaebbcb3821885a3f609abc4 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cu +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu @@ -15,6 +15,5 @@ limitations under the License. */ #include "paddle/fluid/operators/dgc_clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_clip_by_norm, - ops::DGCClipByNormKernel); +REGISTER_OP_CUDA_KERNEL(dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu index 0f0bf441a70bef9cb69362a9cf333aeb51e835b6..e8aa9b5245df7c0cb2c2c30491a8a0184ff6bced 100644 --- a/paddle/fluid/operators/dgc_op.cu +++ b/paddle/fluid/operators/dgc_op.cu @@ -16,5 +16,4 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc, ops::DGCOpKernel); +REGISTER_OP_CUDA_KERNEL(dgc, ops::DGCOpKernel); diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu index c40206b0032e0c5c87af29ca4f660127f3ad8803..c9afc983b03bbcb59b9e996fade53bc7c2f55344 100644 --- a/paddle/fluid/operators/diag_op.cu +++ b/paddle/fluid/operators/diag_op.cu @@ -16,9 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +REGISTER_OP_CUDA_KERNEL(diag, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc index 3fa1b6ef08c5641c33271582858053cfc4a289c1..92e5d66776d6e0206e09c2b196c62645f3fa025d 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -127,7 +127,7 @@ TEST(DlnneEngineOp, manual) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // Prepare variables. 
CreateCUDATensor(&scope, "x", std::vector({2, 4})); CreateCUDATensor(&scope, "y", std::vector({4, 6})); @@ -145,7 +145,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 7733d202e5781ab2c8ee884478c0ea87d7e2b9d0..d51c57d6eab8d0fa12a7e2e2b4c2edf1e42d6828 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -98,7 +98,7 @@ TEST(Dropout, CPUDense) { TEST(Dropout, GPUDense) { f::Scope scope; p::CUDAPlace place; - p::CUDADeviceContext ctx(place); + p::phi::GPUContext ctx(place); Compare(scope, ctx); } */ diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 369fea2b0b189bd568aa43b0a616fbb19522878d..681f91ffa689df337c683149057fa3987ed45871 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -85,9 +85,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { auto batch_size = x1_t->dims()[0]; auto normalized = ctx.Attr("normalized"); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()).stream(); framework::Vector hyp_lod(batch_size + 1); framework::Vector ref_lod(batch_size + 1); @@ -124,8 +123,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { } const size_t num_strs = hyp_lod.size() - 1; - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), + phi::funcs::SetConstant set_constant; + set_constant(ctx.template device_context(), sequence_num, static_cast(num_strs)); diff --git a/paddle/fluid/operators/eigvalsh_op.cu b/paddle/fluid/operators/eigvalsh_op.cu index 3ed431f8002308244cbd50216af99ca300256a58..880570d1be09b9dcf2eab7a2a9139e39ba40e4fb 100644 --- a/paddle/fluid/operators/eigvalsh_op.cu +++ b/paddle/fluid/operators/eigvalsh_op.cu @@ -16,25 +16,23 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigvalsh, - ops::EigvalshKernel, - ops::EigvalshKernel, - ops::EigvalshKernel>, - ops::EigvalshKernel>); +REGISTER_OP_CUDA_KERNEL(eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); REGISTER_OP_CUDA_KERNEL( eigvalsh_grad, - ops::EigvalshGradKernel, - ops:: - EigvalshGradKernel, - ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, - ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index b0b0db5cde424de3589ff23f2134b5da77c8d4df..f81b76aa4877c38275e7fb7ac2b443ab3b98d4d5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1596,7 +1596,7 @@ static inline std::vector GetReduceDim(const framework::DDim &in, #if defined(__NVCC__) || defined(__HIPCC__) template -void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXAndYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, @@ -1609,7 +1609,7 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, } template -void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXOrYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 6f1e04ebfa6cfaf666fbd09eeddb341884425371..3d32c9b8a148f75334e9a797cf40cb46e0ca1f71 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -158,17 +158,15 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); +REGISTER_OP_CUDA_KERNEL(expand_as, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); #endif diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index d8c66f95a1395e9a8f344785d536c66cf0aeae96..1261b7777010e02561676e4b0420de595853c352 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -294,19 +294,17 @@ REGISTER_OP_CPU_KERNEL(expand_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CUDA_KERNEL( expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); #endif diff --git 
a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 5562baca97ff5ed6027006833b07956bc7b0f90b..34855fbc96e12f465c9f986d2357dfb2944d0037 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 65dfad185c186d8268af942854e98897d4eb2e62..161b87ea392590d7db7b4a341ab9c629bc9da13a 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -31,8 +31,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -102,8 +102,8 @@ __global__ void DequantizeTwoScale(const T* in, } template -struct ChannelDequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, @@ -163,10 +163,10 @@ struct ChannelDequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; -template struct ChannelDequantizeFunctor; -template struct ChannelDequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index c7ad664b7da041a7f86adcb0deef021816ccfbb0..a19369fc6f2ce34256f853ddd7f74f937d5a0f82 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_quantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 3b1877f2bc8e55327ed9b4102399dcdd8346904f..22ba8254cdc2c2dcf0d668639ab110fc06c94622 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -72,8 +72,8 @@ __global__ void FindAbsMaxKernel(const T *in, const int n, T *out) { } template -struct FindAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const T *in, const int num, T *out) { @@ -90,9 +90,8 @@ struct FindAbsMaxFunctor { } }; -template struct FindAbsMaxFunctor; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template __global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in, @@ -164,8 +163,8 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1( } template -struct FindChannelAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindChannelAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_tensor, const int quant_axis, T *out_abs_max) { @@ -215,7 +214,7 @@ struct FindChannelAbsMaxFunctor { } }; -template struct FindChannelAbsMaxFunctor; +template struct FindChannelAbsMaxFunctor; template __global__ void ClipAndQuantKernel(const T *in, @@ -289,8 +288,8 @@ __global__ void ClipAndQuantDequantKernel(const T *in, } template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -309,11 +308,11 @@ struct ClipAndFakeQuantFunctor { } }; -template struct ClipAndFakeQuantFunctor; +template struct ClipAndFakeQuantFunctor; template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -408,8 +407,8 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in, } template -struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -462,8 +461,7 @@ struct ChannelClipAndFakeQuantFunctor { } }; -template struct ChannelClipAndFakeQuantFunctor; +template struct ChannelClipAndFakeQuantFunctor; template __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, @@ -491,8 +489,8 @@ __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, } template -struct FindRangeAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindRangeAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &cur_scale, const framework::Tensor &last_scale, const framework::Tensor &iter, @@ -535,7 +533,7 @@ struct FindRangeAbsMaxFunctor { sizeof(int), ctx.stream()); ctx.Wait(); - FindAbsMaxFunctor()( + 
FindAbsMaxFunctor()( ctx, scale_arr, len, out_scale_data); } } @@ -556,11 +554,11 @@ __global__ void FindMovingAverageAbsMaxKernel(const T *in_state, *out_scale = accum / state; } -template struct FindRangeAbsMaxFunctor; +template struct FindRangeAbsMaxFunctor; template -struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindMovingAverageAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_accum, const framework::Tensor &in_state, const T *cur_scale, @@ -660,8 +658,8 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, } template -struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -712,8 +710,7 @@ struct ChannelClipFakeQuantDequantFunctor { } }; -template struct ChannelClipFakeQuantDequantFunctor; +template struct ChannelClipFakeQuantDequantFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cu.cc b/paddle/fluid/operators/fc_op.cu.cc index 4147903551d5e54802075b38f45ac67b9132173c..35c55135f212bda355684688f5df16a458cfd4fb 100644 --- a/paddle/fluid/operators/fc_op.cu.cc +++ b/paddle/fluid/operators/fc_op.cu.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fc, - ops::FCOpKernel, - ops::FCOpKernel, - ops::FCOpKernel); +REGISTER_OP_CUDA_KERNEL(fc, + ops::FCOpKernel, + ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 93fb678e2117c8280b25ec1835c44c8b6fc8b5c6..43776e98a0225c086b1111c00e4cfb4dede792a1 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -42,7 +42,7 @@ void GetLinearOp(const std::vector &x, const std::vector &y, const framework::DDim &x_dim, const framework::DDim &y_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -87,7 +87,7 @@ void GetElementwiseAddOp(const std::vector &x, const std::vector &y, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *out) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -128,7 +128,7 @@ void GetLinearOpGrad(const std::vector &x_vec, const framework::DDim &x_dim, const framework::DDim &y_dim, const framework::DDim &out_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -218,7 +218,7 @@ template void GetElementwiseAddOpGrad(const std::vector &dout_vec, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *dy_vec) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -308,7 +308,7 @@ class TestFeedForward { bsz_seq_ = batch_size_ * seq_len_; output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; - ctx_ = new platform::CUDADeviceContext(place_); + ctx_ = new phi::GPUContext(place_); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place_, ctx_->stream()) .get()); @@ -559,7 +559,7 @@ class TestFeedForward { std::vector base_dinput_vec_, base_dweight_vec_, 
base_dbias_vec_; platform::CUDAPlace place_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; // test for fp32, fp16, fp32+bias and fp16+bias diff --git a/paddle/fluid/operators/fill_any_op.cu.cc b/paddle/fluid/operators/fill_any_op.cu.cc index ca1726508c40d16d56697cd6262a8f8c67619e70..2a561e6d3500e62bca32e599ca2b7254d31c4392 100644 --- a/paddle/fluid/operators/fill_any_op.cu.cc +++ b/paddle/fluid/operators/fill_any_op.cu.cc @@ -17,20 +17,18 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel); REGISTER_OP_CUDA_KERNEL( fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 8e51c203d4122cd53c501129e1ea201b8247ddca..bd8303fe402f41da0cf22e9170d90d1f06d062a3 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -133,9 +133,9 @@ class FillConstantKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); #else diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 8e30e0833d6fc090a76b3bd988d78a71edfd5f71..105b207636c1f0c0b10e7d3e93a8fe187385c75e 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu index a7c26caa8fb45945e8ba306009f26114100cb26a..1b6ab71386b3b1cd5e5db4043f25ef78a13d07a4 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_diagonal_tensor_kernel(int64_t size, @@ -109,7 +108,7 @@ class FillDiagonalTensorCUDAKernel : public framework::OpKernel { auto size = out->numel(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = @@ -175,8 +174,7 @@ class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { auto size = dx->numel(); - auto &dev_ctx = - ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = diff --git 
a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 91809b8cd11bd0014b70d72eb76cae2c2668e5df..fad1bba49f39e6959d20f17bae5b0cdb19ea3776 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -21,28 +21,24 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_zeros_like, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CUDA_KERNEL( fill_zeros_like2, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index e287ce1515a9c1b65c2b3e069c320f0a5f440a04..0a055c688eec0afdff2d1dd846b1c05852733293 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -17,35 +17,31 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CUDA_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); -REGISTER_OP_CUDA_KERNEL( - flatten2, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel); -REGISTER_OP_CUDA_KERNEL( - flatten2_grad, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel); +REGISTER_OP_CUDA_KERNEL(flatten, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CUDA_KERNEL(flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CUDA_KERNEL(flatten2, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CUDA_KERNEL(flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu index 2d2b334b021d79512133bd1b3f77974f3b5b0452..7728d57a276af8401659b80792a1636338655d81 100644 --- a/paddle/fluid/operators/fold_op.cu +++ b/paddle/fluid/operators/fold_op.cu @@ -16,12 +16,10 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fold, - ops::FoldOpKernel, - ops::FoldOpKernel); +REGISTER_OP_CUDA_KERNEL(fold, + ops::FoldOpKernel, + ops::FoldOpKernel); -REGISTER_OP_CUDA_KERNEL( - fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); +REGISTER_OP_CUDA_KERNEL(fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu index a762054a1ea4fba0c243ff92c3ed2f639953c02c..d193136730732721fa4afcf7c8b31e83e8d8a641 100644 --- a/paddle/fluid/operators/fsp_op.cu +++ b/paddle/fluid/operators/fsp_op.cu @@ -18,8 +18,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(fsp, - ops::FSPOpKernel, - ops::FSPOpKernel); + ops::FSPOpKernel, + ops::FSPOpKernel); REGISTER_OP_CUDA_KERNEL(fsp_grad, - ops::FSPGradOpKernel, - ops::FSPGradOpKernel); + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index 3db4992bd29c9bee9ac884fd6091684455636cc3..baed3ca7a1aa233a9f337d4db492e1c6cea25cc9 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -22,7 +22,7 @@ namespace operators { template class AttnLayerNorm { public: - AttnLayerNorm(const platform::CUDADeviceContext& dev_ctx, + AttnLayerNorm(const phi::GPUContext& dev_ctx, float epsilon, int64_t batch_size, int64_t feature_size) @@ -82,7 +82,7 @@ class AttnLayerNorm { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t feature_size_; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 6dd6cc28139b77198d9ac253640729ad86d5c5ba..fa50d5b23bfa2d43bd5676e2a129593c604354b4 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -95,7 +95,7 @@ __global__ void BroadcastKernelBinary( // bias add forward impl for "[m, n] + [n] = [m, n]" template -void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, +void LaunchBiasAddFwKernel(const phi::GPUContext& ctx, int m, int n, const T* in0, @@ -302,7 +302,7 @@ __global__ void BiasAddBw1DReduceKernel(const ReduceParamType* temp_sum, } template -void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, +void Launch2DColumnReduce(const phi::GPUContext& dev_ctx, const int max_threads, const int reduce_num, const int left_num, @@ -345,11 +345,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, // input // and d_bias[n] as output. 
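The attn_bias_add launchers above show the convention these fused operators settle on: free launch helpers take the device context as const phi::GPUContext& and draw the CUDA stream (and device limits such as GetMaxPhysicalThreadCount) from it rather than creating their own. A minimal sketch of that convention, illustrative only — VecAddKernel, LaunchVecAdd, and the launch configuration are placeholder names, and the phi GPU-context include path is an assumption, not part of the patch:

// Assumed include; the real fused headers pull this in indirectly.
#include "paddle/phi/backends/gpu/gpu_context.h"

template <typename T>
__global__ void VecAddKernel(const T *in, const T *bias, T *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = in[i] + bias[i];
  }
}

template <typename T>
void LaunchVecAdd(
    const phi::GPUContext &ctx, const T *in, const T *bias, T *out, int n) {
  // The context owns the stream; helpers only borrow it for the launch.
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  VecAddKernel<T><<<blocks, threads, 0, ctx.stream()>>>(in, bias, out, n);
}
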
template -void LaunchBiasAddBwKernel(const platform::CUDADeviceContext& dev_ctx, - int m, - int n, - const T* d_out, - T* d_bias) { +void LaunchBiasAddBwKernel( + const phi::GPUContext& dev_ctx, int m, int n, const T* d_out, T* d_bias) { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int reduce_num = m; int left_num = n; diff --git a/paddle/fluid/operators/fused/attn_feed_forward.h b/paddle/fluid/operators/fused/attn_feed_forward.h index 568c283d3e41694eb07e7b5d104c96ef2e477774..753eb44710809a310400570de9389f5547eb5046 100644 --- a/paddle/fluid/operators/fused/attn_feed_forward.h +++ b/paddle/fluid/operators/fused/attn_feed_forward.h @@ -24,7 +24,7 @@ namespace operators { template class FeedForward { public: - FeedForward(const platform::CUDADeviceContext& dev_ctx, + FeedForward(const phi::GPUContext& dev_ctx, int bsz_seq, int output_size, int input_size, @@ -53,7 +53,7 @@ class FeedForward { // column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out) // here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -78,7 +78,7 @@ class FeedForward { T* input, T* weight, T* d_output, T* d_input, T* d_weight, T* d_bias) { T alpha = static_cast(1.0); T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); // column-major: gemm-nt, get d_weight. CBLAS_TRANSPOSE transA = CblasTrans; @@ -116,7 +116,7 @@ class FeedForward { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int bsz_seq_, output_size_, input_size_; bool compute_bias_; }; diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 9adfe8e088d26c1f5f7d15e17b4c3dfa88177886..07947f522cdaecd9eede5a7ed13ed91f4a28413d 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -30,7 +30,7 @@ template class AttnMatMul { public: // (m, n, k) = bsz_seq, output_size, input_size - AttnMatMul(const platform::CUDADeviceContext& dev_ctx, + AttnMatMul(const phi::GPUContext& dev_ctx, bool transA, bool transB, int bsz_seq, @@ -60,7 +60,7 @@ class AttnMatMul { T beta = static_cast(0.0); // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -91,7 +91,7 @@ class AttnMatMul { T beta_dA = use_addto ? 
static_cast(1.0) : static_cast(0.0); T beta_dB = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { // forward: gemm-nt if (transB_) { @@ -223,7 +223,7 @@ class AttnMatMul { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool transA_; bool transB_; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 121cbc909b8129c3ceec66dbe75c8eedbf10c296..81e8c5732665f81dd3a388ace1697a7faecd2fdb 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -43,7 +43,7 @@ template class CUDNNConvFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* bias = ctx.Input("Bias"); @@ -109,17 +109,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); + ctx.AllocateTmpTensor(new_input_shape, dev_ctx); const int rank = transformed_input_channel.dims().size(); T pad_value(0.0); switch (rank) { case 4: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, @@ -127,7 +125,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { &transformed_input); } break; case 5: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 0bda60f6b8b07baa3b98e4bcd94c7b48f73b2b0f..e11792a5dfb6195cf8acf8f122e1fc12886e2444 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -149,7 +149,7 @@ void ComputeInplaceRelu(framework::Tensor *cpu_x) { } } -void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, +void ComputeBatchNormForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, @@ -216,7 +216,7 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_z, const Tensor &cpu_scale, @@ -280,7 +280,7 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluBackward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluBackward(const phi::GPUContext &ctx, const Tensor &cpu_dy, const Tensor &cpu_x, const Tensor &cpu_scale, @@ -384,10 +384,8 @@ class CudnnBNAddReluTester { << ", is_relative_atol=" << is_relative_atol << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ << ", has_shortcut=" << has_shortcut_; - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext 
*ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; @@ -469,10 +467,8 @@ class CudnnBNAddReluTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_dx_base; framework::Tensor cpu_dz_base; @@ -526,7 +522,7 @@ class CudnnBNAddReluTester { {channels_}, static_cast(0.0f), cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -573,7 +569,7 @@ class CudnnBNAddReluTester { } } - void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineForwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -594,7 +590,7 @@ class CudnnBNAddReluTester { saved_reserve_space); } - void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineBackwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, @@ -614,7 +610,7 @@ class CudnnBNAddReluTester { cpu_dbias); } - void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx, + void ComputeFusedBNStatsFinalize(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_bn_scale, const Tensor &cpu_bn_bias, @@ -671,7 +667,7 @@ class CudnnBNAddReluTester { } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -809,7 +805,7 @@ class CudnnBNAddReluTester { } // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 719c2fe64e5e918a6a07892ff60afa94b598f3ff..628642b9563166a27160256512f374ee5a02be4e 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -61,7 +61,7 @@ struct BNStatsFinalizeArgs { template class CudnnBNStatsFinalize { public: - CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + CudnnBNStatsFinalize(const phi::GPUContext &ctx, const std::vector ¶m_shape) : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { @@ -69,7 +69,7 @@ class CudnnBNStatsFinalize { } ~CudnnBNStatsFinalize() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &sum, const Tensor &sum_of_squares, const Tensor &scale, @@ -130,7 +130,7 @@ class CudnnBNStatsFinalize { } private: - void TrainInit(const platform::CUDADeviceContext &ctx) { + void TrainInit(const phi::GPUContext &ctx) { // Set constant_param for train op train_op_.SetOpConstParamAttr({CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, @@ -167,7 +167,7 @@ class CudnnBNStatsFinalize { 
&workspace_size_bytes); } - void InferenceInit(const platform::CUDADeviceContext &ctx) { + void InferenceInit(const phi::GPUContext &ctx) { // Set constant_param for inference op inference_op_.SetOpConstParamAttr({CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 3c924ddd9d9bba2e1d011304ecaedc9a03ec00e3..34cf677223c4227c3c5aca0278139ca3f1010305 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -37,7 +37,7 @@ struct NormConvolutionArgs { compute_type = platform::CudnnDataType::type; } - void Set(const platform::CUDADeviceContext &ctx, + void Set(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -124,7 +124,7 @@ struct NormConvolutionArgs { conv_desc.set(dtype, paddings, strides, dilations, false, group); } - bool IsSupport(const platform::CUDADeviceContext &ctx, + bool IsSupport(const phi::GPUContext &ctx, const std::vector &filter_shape, int stride, int dilation, @@ -167,7 +167,7 @@ struct NormConvolutionArgs { template class CudnnNormConvolution { public: - CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + CudnnNormConvolution(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -186,7 +186,7 @@ class CudnnNormConvolution { } ~CudnnNormConvolution() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, Tensor *output, @@ -228,7 +228,7 @@ class CudnnNormConvolution { } private: - CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetForwardOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetForward()); @@ -284,7 +284,7 @@ class CudnnNormConvolution { template class CudnnNormConvolutionGrad { public: - CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + CudnnNormConvolutionGrad(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -304,7 +304,7 @@ class CudnnNormConvolutionGrad { } ~CudnnNormConvolutionGrad() {} - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, const Tensor &output_grad, @@ -327,7 +327,7 @@ class CudnnNormConvolutionGrad { } private: - void BackwardFilter(const platform::CUDADeviceContext &ctx, + void BackwardFilter(const phi::GPUContext &ctx, T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { @@ -355,7 +355,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - void BackwardData(const platform::CUDADeviceContext &ctx, + void BackwardData(const phi::GPUContext &ctx, T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, @@ -387,7 +387,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetBackwardFilterOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetBackward()); @@ -430,7 +430,7 @@ class CudnnNormConvolutionGrad { return wgrad_op; } - size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t GetWorkspaceSizeBwdData(const phi::GPUContext &ctx) { size_t 
workspace_size = 0U; auto handle = ctx.cudnn_handle(); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 7d404e6b3ed3d649c671b2e039997a89c475a885..ef93612ffce39ae81ece6c33c038a3714f755424 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -94,7 +94,7 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DForward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, Tensor *cpu_output, @@ -130,7 +130,7 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DBackward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -242,10 +242,8 @@ class CudnnNormConvolutionTester { ~CudnnNormConvolutionTester() {} void CheckForward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; @@ -266,10 +264,8 @@ class CudnnNormConvolutionTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_input_grad_base; framework::Tensor cpu_filter_nchw_grad_base; @@ -304,7 +300,7 @@ class CudnnNormConvolutionTester { &cpu_output_grad_); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { @@ -314,7 +310,7 @@ class CudnnNormConvolutionTester { *cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } - void BaselineBackward(const platform::CUDADeviceContext &ctx, + void BaselineBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { ComputeConv2DBackward(ctx, @@ -329,7 +325,7 @@ class CudnnNormConvolutionTester { } // get forward results of cudnn_norm_conv - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { @@ -367,7 +363,7 @@ class CudnnNormConvolutionTester { sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad, framework::Tensor *cpu_filter_grad) { framework::Tensor input; @@ -443,7 +439,7 @@ TEST(CudnnNormConvFp16, K1S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -473,7 +469,7 @@ 
TEST(CudnnNormConvFp16, K3S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -503,7 +499,7 @@ TEST(CudnnNormConvFp16, K1S1O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -533,7 +529,7 @@ TEST(CudnnNormConvFp16, K1S2O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() <= 70) { diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 61e513e911a36e67ffd70dcb22fc928a09d6af2a..b25605c6ca0574ee3a925d11be4109a9ac3104f1 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -100,7 +100,7 @@ struct ScaleBiasAddReluArgs { template class CudnnScaleBiasAddRelu { public: - CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + CudnnScaleBiasAddRelu(const phi::GPUContext &ctx, const std::string &act_type, bool fuse_add, bool has_shortcut, @@ -116,7 +116,7 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &x, const Tensor &x_scale, const Tensor &x_bias, @@ -171,7 +171,7 @@ class CudnnScaleBiasAddRelu { fwd_workspace_byte_); } - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &dy, const Tensor &x, const Tensor &scale, @@ -237,7 +237,7 @@ class CudnnScaleBiasAddRelu { } private: - void ForwardInit(const platform::CUDADeviceContext &ctx) { + void ForwardInit(const phi::GPUContext &ctx) { // Set constant_param fwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, @@ -285,7 +285,7 @@ class CudnnScaleBiasAddRelu { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); } - void BackwardInit(const platform::CUDADeviceContext &ctx) { + void BackwardInit(const phi::GPUContext &ctx) { // Set constant_param bwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index ef1befbb32033d17854f4f9d76b3eb38115e529d..7de59dd9ee2e3910991d99f12bbdebbab88a2484 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -67,7 +67,7 @@ class AttnDropoutParam { template class FMHARef { public: - FMHARef(const platform::CUDADeviceContext& dev_ctx, + FMHARef(const phi::GPUContext& dev_ctx, int64_t batch_size, int64_t seq_len, int64_t num_head, @@ -146,7 +146,7 @@ class FMHARef { // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transB = CblasTrans; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; int gemm_n = out_seq_len; @@ -274,7 +274,7 @@ class FMHARef { Tensor* transpose_2_out_grad_tensor, Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) { - auto blas = 
phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; int softmax_axis = -1; @@ -479,7 +479,7 @@ class FMHARef { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t seq_len_; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 2c3fd75d8e012284491ffe2b838381773c24a567..ed904df93df40d571600fa589fdc6f06a59a802e 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -43,7 +43,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index c94aae0dd49a8d7b338033a9e29076e6036001a9..53984707d50a504b56b86312085d9964a6708a4b 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -37,7 +37,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormActKernel +class FusedBatchNormActKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -88,7 +88,7 @@ class FusedBatchNormActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, // skip the batch norm calculation, let y = act(x). 
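Inside an OpKernel, as in the fused_bn_activation_op.cu hunks above, the same context is obtained from the execution context rather than from the DeviceContextPool. A rough sketch of that access pattern, assuming the template argument stripped from device_context() in this rendering is phi::GPUContext (matching the surrounding call sites), with ExampleFusedKernel as a placeholder name:

template <typename T>
class ExampleFusedKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    // One lookup replaces the old platform::CUDADeviceContext fetch; the
    // cuDNN handle and CUDA stream are reached through the same object.
    auto &dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto handle = dev_ctx.cudnn_handle();
    auto stream = dev_ctx.stream();
    (void)handle;
    (void)stream;
  }
};
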
@@ -217,7 +217,7 @@ class FusedBatchNormActKernel }; template -class FusedBatchNormActGradKernel +class FusedBatchNormActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -268,7 +268,7 @@ class FusedBatchNormActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { if (act_type == "relu") { auto x_v = framework::EigenVector::Flatten(*x); @@ -281,9 +281,7 @@ class FusedBatchNormActGradKernel PADDLE_THROW( platform::errors::Unimplemented("Unsupported activation type")); } - phi::funcs::SetConstant> - functor; + phi::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); return; @@ -402,12 +400,12 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel); + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel); REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act_grad, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel); + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index e703ce810cd2f475e72bf23ea096e9ce9d3c0fbe..23dbbe2ad084c31e700b12f48c1d074b706b35db 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -36,7 +36,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormAddActKernel +class FusedBatchNormAddActKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -81,7 +81,7 @@ class FusedBatchNormAddActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); @@ -194,7 +194,7 @@ class FusedBatchNormAddActKernel }; template -class FusedBatchNormAddActGradKernel +class FusedBatchNormAddActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -243,7 +243,7 @@ class FusedBatchNormAddActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; @@ -353,9 +353,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation, - ops::FusedBatchNormAddActKernel); + ops::FusedBatchNormAddActKernel); REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation_grad, - ops::FusedBatchNormAddActGradKernel); + ops::FusedBatchNormAddActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h 
b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 5016cb65fb7a078a2f28a4d4546311e3942ab630..732da5fa52a8bcdb27cacc5b196ba7240de02746 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -125,7 +125,7 @@ void LaunchDropoutActBias(Functor act_functor, const T *bias, T *dst, MaskType *mask_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { SetZero(ctx, dst, rows * cols); @@ -277,7 +277,7 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0); auto factor = dropout_prob == static_cast(1.0f) ? zero diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 18f51b5d02bbef1cef6df9ca902a9b8f56a3e93e..06810c18cc05a6af5c6167b81decefe971762393 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -57,7 +57,7 @@ struct TestFusedDropoutActBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedDropoutActBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedDropoutActBias(int rows_, @@ -87,7 +87,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedDropoutActBias() {} diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index faac7691ae2c4259509af77f66fe8693b155070d..0f37d242ebcb3c9c334280b18926149d761b0f97 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -40,7 +40,7 @@ namespace operators { * 2D grids: gridDim.y = rows */ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { @@ -101,9 +101,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, } template -inline void SetZero(const platform::CUDADeviceContext &ctx, - T *ptr, - const size_t size) { +inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index aa4c6622f704ab6ad149992d5df6f767ff6469f6..208b2a58bca6918482bac195ab2f7eb56c8cf364 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -100,8 +100,7 @@ struct DropoutParam { seed_val = context.Attr(pre_fix + "seed"); } - int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, - const int offset) { + int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) { uint64_t tmp_increment; 
GetSeedDataAndIncrement( ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment); @@ -113,7 +112,7 @@ struct DropoutParam { template class FusedDropoutHelper { private: - int GetIncrement(const platform::CUDADeviceContext& ctx) { + int GetIncrement(const phi::GPUContext& ctx) { const int VecSize = MAX_CACHE_BYTES / sizeof(T); const int real_vec_size = cols_ % VecSize == 0 ? VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, @@ -130,7 +129,7 @@ class FusedDropoutHelper { public: FusedDropoutHelper() {} - FusedDropoutHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param) { @@ -140,7 +139,7 @@ class FusedDropoutHelper { } // out = residual + dropout( src + bias ) - void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -162,7 +161,7 @@ class FusedDropoutHelper { ctx); } - void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const MaskType* mask, T* d_src, @@ -189,7 +188,7 @@ class FusedDropoutHelper { } // out = dropout(activation(src + bias)) - void DropoutActBias(const platform::CUDADeviceContext& ctx, + void DropoutActBias(const phi::GPUContext& ctx, const T* src, const T* bias, const std::string& act_method, @@ -234,7 +233,7 @@ class FusedDropoutHelper { } } - void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, + void DropoutActBiasGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const T* bias, @@ -297,7 +296,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { epsilon_ = epsilon; } - FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutLayerNormHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param, @@ -308,7 +307,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } // call layer_norm - void LayerNorm(const platform::CUDADeviceContext& ctx, + void LayerNorm(const phi::GPUContext& ctx, const T* src, const LayerNormParamType* gamma, const LayerNormParamType* beta, @@ -324,7 +323,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } } - void LayerNormGrad(const platform::CUDADeviceContext& ctx, + void LayerNormGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const LayerNormParamType* gamma, @@ -350,7 +349,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { // out = layernorm(residual + dropout(src + bias)) template , bool is_same_type = false> - void LayernormResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -392,7 +391,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } template , bool is_same_type = false> - void LayernormResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const T* layernorm_src, const MaskType* mask, diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index e3ab187f0d70be185e30abc3387787b1cce0c520..8fac3165f1c08731b72496e25abe8d0d220a11a9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -49,7 +49,7 
@@ void Dropout(const std::vector &x, const framework::DDim &x_dim, std::vector *out, std::vector *mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, uint64_t seed, float dropout_prob, bool is_upscale_in_train, @@ -97,7 +97,7 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, const std::vector &dout, const std::vector &mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, float dropout_prob, bool is_upscale_in_train) { framework::Scope scope; @@ -148,7 +148,7 @@ void LayerNorm(const std::vector> &scale, const float epsilon, const int rows, const int cols, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); paddle::optional scale_opt; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu index 7b44aa82e4a22ba195fbe8b86ef78ad7f37397f8..80b10021c09eb03f784dedfd0fc68553a5ed8747 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu @@ -17,36 +17,28 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index d102c5e4705e9fe05e5e18b38c2a066ee5c301cd..abc9b451d1776c91b61aa62c75db882215a4a27c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -121,5 +121,4 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_embedding_eltwise_layernorm, - ops::EmbeddingEltWiseLayerNormKernel); + ops::EmbeddingEltWiseLayerNormKernel); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 37aa5cbd14dac80e6efa55e6b481ccbcf92873b9..3e117c45359b189f9771eb245aa5528f59e7320e 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -397,8 +397,8 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { const T* w_data = w->data(); T* out_data = out->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - auto 
blas = phi::funcs::GetBlas(dev_ctx); + auto& dev_ctx = ctx.template device_context(); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.GEMM(false, false, M, diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 4126f5ad7263aee63f4ba79757c0b61d3ff7fa7a..60b5ecfdd74bfaa484fa5d120dcb5edab8006aac 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -36,7 +36,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor& tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext& ctx) { + const phi::GPUContext& ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -73,7 +73,7 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT template class FusedFeedForwardKernel : public framework::OpKernel { public: - void MatMul(const platform::CUDADeviceContext& ctx, + void MatMul(const phi::GPUContext& ctx, const framework::Tensor& a, const framework::Tensor& b, framework::Tensor* c) const { @@ -86,7 +86,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); } - void FFN(const platform::CUDADeviceContext& ctx, + void FFN(const phi::GPUContext& ctx, const framework::Tensor& x, const framework::Tensor& linear1_weight, const framework::Tensor* linear1_bias, @@ -309,7 +309,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { template class FusedFeedForwardGradKernel : public framework::OpKernel { public: - void MatMulGrad(const platform::CUDADeviceContext& ctx, + void MatMulGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& a, const framework::Tensor& b, @@ -327,7 +327,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); } - void FFNGrad(const platform::CUDADeviceContext& ctx, + void FFNGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& x, const framework::Tensor& dropout1_mask, @@ -630,14 +630,12 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_feedforward, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel); + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel); REGISTER_OP_CUDA_KERNEL( fused_feedforward_grad, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 45d47908b99e05df3f9c5170623813589f70edeb..f9d9fad110e4991bf149f51863da3267241abed4 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -44,7 +44,7 @@ inline std::string MemoryDebugString(const Tensor& t) { } template -void AllocWithDebugInfo(const platform::CUDADeviceContext& dev_ctx, +void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, const std::string& info, Tensor* t) { t->mutable_data(dev_ctx.GetPlace()); @@ -59,7 +59,7 @@ struct TernaryAddFunctor { template struct GateAttentionConfig { public: - const 
platform::CUDADeviceContext& dev_ctx; + const phi::GPUContext& dev_ctx; bool merge_qkv; bool has_gating; @@ -86,7 +86,7 @@ struct GateAttentionConfig { phi::DDim qktv_out_dims; phi::DDim gate_out_dims; - GateAttentionConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -249,7 +249,7 @@ struct GateAttentionConfig { template struct GateAttentionGradConfig : public GateAttentionConfig { public: - GateAttentionGradConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionGradConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -322,7 +322,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { template class FMHAGateRef { public: - FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + FMHAGateRef(const phi::GPUContext& dev_ctx, bool merge_qkv) : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} void ComputeForward(const Tensor* nonbatched_bias, @@ -748,7 +748,7 @@ class FMHAGateRef { int64_t stride_a = m * k; int64_t stride_b = k * n; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, @@ -764,7 +764,7 @@ class FMHAGateRef { stride_b); } - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool merge_qkv_; }; diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 7400246f40725cc1a966f78e84071c9d5b68acba..139a365c10e12b00bcbc924b2bec12853f50351f 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -350,7 +350,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); AllocWithDebugInfo(dev_ctx, "fmha_out", fmha_out); if (has_gating) { @@ -441,7 +441,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "query_grad", query_grad); GateAttentionGradConfig config( diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 3ebb9f9e640cc0c8e56b06cd36614fba327cad4e..219a517315b52f9c6aa6258883ff983a4b638298 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -29,7 +29,7 @@ template class FusedGemmEpilogueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -320,7 +320,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { template static void ComputeImpl(const framework::ExecutionContext& ctx) { using Trait = FusedGEMMGradTrait; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* dout = 
ctx.Input("DOut"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -677,17 +677,14 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel); + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue_grad, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 301b62524a54dda8c4abba23983b58cc36090d76..7bb3498567cc3fcc3a0674ad0f36f24bb15c35b8 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -688,7 +688,7 @@ void LaunchLayernormResidualDropoutBias( T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = ctx.GetPlace(); @@ -846,7 +846,7 @@ template void LaunchLayernormResidualDropoutGrad( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const uint32_t rows, const uint32_t cols, const float epsilon, diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 4f8ceba177e21db1cdd74af0143dd3cee52927e1..d3c6cca95efb0bf6c965cd2304bf08ad10bc2324 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -54,7 +54,7 @@ struct TestFusedLayernormResidualDropoutBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedLayernormResidualDropoutBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedLayernormResidualDropoutBias { epsilon = 0.00001f; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedLayernormResidualDropoutBias(int _rows, @@ -92,7 +92,7 @@ struct TestFusedLayernormResidualDropoutBias { has_layernorm_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedLayernormResidualDropoutBias() {} diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index a8bebd5012db5bc00eb8b8ffd1a759f69415f4b3..a858b31e23c8ada64e3d74973ea3197d2c403347 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -49,7 +49,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -996,7 +996,7 @@ void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, } template -void fmha(const platform::CUDADeviceContext &dev_ctx, +void fmha(const phi::GPUContext &dev_ctx, const Tensor &qkv_tensor, const Tensor &qkv_bias_tensor, const Tensor &src_mask_tensor, @@ -1118,7 +1118,7 @@ __global__ void write_cache_v_kernel(T *cache_v, } template -void write_cache_kv(const platform::CUDADeviceContext &dev_ctx, +void write_cache_kv(const phi::GPUContext &dev_ctx, T *cache_k, T *cache_v, const T *k, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index f9bf4c3c5a377ff667e3ff2b582e05395b1c4ef0..c1131cae5d86f737e19ab6ffc84332cebd85286d 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -178,7 +178,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const T *bias, MaskType *mask_data, T *dst, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { if (residual == dst) return; @@ -323,7 +323,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0f); auto factor = dropout_prob == static_cast(1.0f) ? zero diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 2ff0d3dc036bcaf92c745b8f7179969d679ef617..ba0652339e96e4495a31184dd0b8228fd833ff67 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -58,7 +58,7 @@ struct FusedResidualDropoutBiasTester { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; FusedResidualDropoutBiasTester() { rows = 32; @@ -69,7 +69,7 @@ struct FusedResidualDropoutBiasTester { is_test = false; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } FusedResidualDropoutBiasTester(int rows, @@ -86,7 +86,7 @@ struct FusedResidualDropoutBiasTester { is_test(is_test) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } void SetUp() { diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 6aba49ea33f52209ba3f587b0c2170a2f3206343..a6a49b7ac62485733f4ba14ab74a6a6e7db97332 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -114,9 +114,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext const float padding_value, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = input_data.size() + output_data.size() + seqpool_output_data.size() + lods.size(); auto temp_ptr = @@ -320,9 
+319,8 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, const int embedding_size, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = out_grads_data.size() + in_grads_data.size() + cvm_data.size() + lods.size(); auto temp_ptr = diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 22da713f87dfdc6a99aa4e9a04a5e434dbd04125..ce892024d8d9e09d3ccdb009fb0a622ea917b3c3 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -39,7 +39,7 @@ template class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto filters = ctx.MultiInput("Filter"); auto bias = ctx.MultiInput("Bias"); diff --git a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc index 9a81a50efba1b7158941b6b6c0b56b7e67316647..9ce8842a015fdf868772035429c8bd2d4e7e41d1 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc @@ -18,8 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - fusion_group, - ops::FusionGroupKernel, - ops::FusionGroupKernel, - ops::FusionGroupKernel); +REGISTER_OP_CUDA_KERNEL(fusion_group, + ops::FusionGroupKernel, + ops::FusionGroupKernel, + ops::FusionGroupKernel); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 89058cc3fd94cbda9c82f40e7605b3b4db302088..9a1e58c6320fff243de4a8c524b42095c063312d 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -52,7 +52,7 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); T* odata = out->data(); diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 4fd51aec24aa37d90c4a06de1fc47e9f34551502..8a6d5b313ad36809ffe7bbf253ba792d4f6aa840 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -237,7 +237,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { auto *temp_out_data = temp_out_tensor.mutable_data(context.GetPlace()); // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) - auto blas = phi::funcs::GetBlas(device_ctx); + auto blas = phi::funcs::GetBlas(device_ctx); blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); // temp_out_tensor.Resize(temp_out_dims); @@ -285,6 +285,5 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; 
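The REGISTER_OP_CUDA_KERNEL hunks in this patch (such as the multihead_matmul one just below) have lost their angle-bracket template arguments in this text. As a rough guide only, and assuming the usual <DeviceContext, T> kernel parameters with float as an illustrative dtype, the rewrite they perform looks like this sketch:

// Before: kernel instantiated on the fluid device context.
REGISTER_OP_CUDA_KERNEL(
    multihead_matmul,
    ops::MultiHeadMatMulV2Kernel<paddle::platform::CUDADeviceContext, float>);

// After: the same kernel instantiated on phi::GPUContext.
REGISTER_OP_CUDA_KERNEL(multihead_matmul,
                        ops::MultiHeadMatMulV2Kernel<phi::GPUContext, float>);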
-REGISTER_OP_CUDA_KERNEL(
-    multihead_matmul,
-    ops::MultiHeadMatMulV2Kernel);
+REGISTER_OP_CUDA_KERNEL(multihead_matmul,
+                        ops::MultiHeadMatMulV2Kernel);
diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu
index 188c2b21be081faff90e78e860a0686560470cd3..d0a8788e0db2fa389cf88713f887c324802a11d8 100644
--- a/paddle/fluid/operators/fused/resnet_unit_op.cu
+++ b/paddle/fluid/operators/fused/resnet_unit_op.cu
@@ -90,7 +90,7 @@ class ResNetUnitKernel : public framework::OpKernel {
         output_channel;
     auto place = ctx.GetPlace();
-    auto &dev_ctx = ctx.template device_context();
+    auto &dev_ctx = ctx.template device_context();
     // 1. Conv
     Tensor sum_x;
@@ -268,7 +268,7 @@ class ResNetUnitGradKernel : public framework::OpKernel {
     auto bitmask_shape = phi::vectorize(bitmask->dims());
     auto place = ctx.GetPlace();
-    auto &dev_ctx = ctx.template device_context();
+    auto &dev_ctx = ctx.template device_context();
     // 1. Backward of BN (+ Add + Relu) for x, get conv_out_x_grad,
     // scale_x_grad, bias_x_grad
diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu
index 117228d2864a6a39b9db5fd999869ba08e3f4af1..1f9640dd4ba5227a34ab8041b8e8862ef7b0c346 100644
--- a/paddle/fluid/operators/fused/skip_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu
@@ -69,6 +69,5 @@ class SkipLayerNormKernel : public framework::OpKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    skip_layernorm,
-    ops::SkipLayerNormKernel);
+REGISTER_OP_CUDA_KERNEL(skip_layernorm,
+                        ops::SkipLayerNormKernel);
diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu
index d20ffa274a86208d934d71c4c0f1bf1d7f29575d..b82b9a931a1f1bc40f88dfe920bc9379882b7654 100644
--- a/paddle/fluid/operators/fused/yolo_box_head_op.cu
+++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu
@@ -72,8 +72,7 @@ class YoloBoxHeadKernel : public framework::OpKernel {
     auto* out = context.Output("Out");
     auto anchors = context.Attr>("anchors");
     auto class_num = context.Attr("class_num");
-    auto& device_ctx =
-        context.template device_context();
+    auto& device_ctx = context.template device_context();
     auto x_dims = x->dims();
     const int batch_size = x_dims[0];
     const int h = x_dims[2];
diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu
index 6ebf9b8eb31ad431127bfb532c964f36134d5a10..c259d0efb490b2aeb445ca898b8a9ff307f6d18d 100644
--- a/paddle/fluid/operators/fused_softmax_mask_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_op.cu
@@ -587,9 +587,9 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask,
-    ops::SoftmaxMaskFuseKernel,
-    ops::SoftmaxMaskFuseKernel);
+    ops::SoftmaxMaskFuseKernel,
+    ops::SoftmaxMaskFuseKernel);
 REGISTER_OP_CUDA_KERNEL(
     fused_softmax_mask_grad,
-    ops::SoftmaxMaskFuseGradKernel,
-    ops::SoftmaxMaskFuseGradKernel);
+    ops::SoftmaxMaskFuseGradKernel,
+    ops::SoftmaxMaskFuseGradKernel);
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index 1849108ed66ebe54bb2fbe777f00ed2b616bed87..54db576d3171b758dbaf33da773017f619917f1c 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -577,12 +577,9 @@ namespace ops =
paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle, - ops::SoftmaxMaskFuseUpperTriangleKernel, - ops::SoftmaxMaskFuseUpperTriangleKernel); + ops::SoftmaxMaskFuseUpperTriangleKernel, + ops::SoftmaxMaskFuseUpperTriangleKernel); REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle_grad, - ops::SoftmaxMaskFuseUpperTriangleGradKernel, - ops::SoftmaxMaskFuseUpperTriangleGradKernel); + ops::SoftmaxMaskFuseUpperTriangleGradKernel, + ops::SoftmaxMaskFuseUpperTriangleGradKernel); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index a7b64223be7bd272ab53753668e9fb06c1e7b8f6..fa28481f4c4b6aebdf918dd35bea90300c684060 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -143,8 +143,7 @@ struct gpu_gather_scatter_functor { int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); GatherScatterGPUKernel <<>>(self_data, dim, @@ -257,8 +256,7 @@ void gpu_scatter_input_grad_kernel(Tensor self, int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); ScatterInputGradGPUKernel <<>>(grad_data, diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 1e89091b202dee4fdf7a99dea1637517cc64aba0..81b53c8b949763840ed71d4107e42f1ebe8ee5b9 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -61,8 +61,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::DefaultCUDAGenerator(device_id); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); if (seed == 0) { // use global Generator seed diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 06720f1db119a7f3d9447a2cf4acfc87e02bcf86..fc8f195fb70a8583c26cf42e62f817a1f2bdfebe 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -245,8 +245,7 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(0, k, bs, @@ -305,8 +304,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -319,8 +317,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, thrust::raw_pointer_cast(item_count.data()), num_input, @@ -338,8 +335,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -398,8 +394,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(outputs->data()), outputs->size(), size, @@ -411,8 +406,7 @@ void ReindexFunc(const 
framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs, thrust::raw_pointer_cast(reindex_nodes->data()), @@ -625,8 +619,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>( unique_dst_size, thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), @@ -650,7 +643,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(graph_khop_sampler, diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 48872cb8caa98cf6494a179961a47f163743a006..da9ccdf627f445510b2b6b06675bcafc92c5a5f3 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -41,7 +41,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); @@ -90,7 +90,7 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index abf367f70e2ccc83d5e0c1006e62efaee42e944f..668f69b4c75d9710d90c52ed617bfaa2027678ec 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -261,8 +261,7 @@ __global__ void GroupNormForward(const T* x, } template -class GroupNormKernel - : public framework::OpKernel { +class GroupNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -291,8 +290,8 @@ class GroupNormKernel y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); auto* x_data = x->data(); @@ -597,8 +596,7 @@ __global__ void GetXGradientCUDAKernel(int imsize, } template -class GroupNormGradKernel - : public framework::OpKernel { +class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -629,8 +627,8 @@ class GroupNormGradKernel : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor ds, db; ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); @@ -816,11 +814,9 @@ class 
GroupNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - group_norm, - ops::GroupNormKernel, - ops::GroupNormKernel); -REGISTER_OP_CUDA_KERNEL( - group_norm_grad, - ops::GroupNormGradKernel, - ops::GroupNormGradKernel); +REGISTER_OP_CUDA_KERNEL(group_norm, + ops::GroupNormKernel, + ops::GroupNormKernel); +REGISTER_OP_CUDA_KERNEL(group_norm_grad, + ops::GroupNormGradKernel, + ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 37ba915a24f1c8b72e0fa5947e69ad24fbc5f1f7..f3665da1816410e8e4cacd8d90221fe29b98e24a 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -131,11 +131,9 @@ class GRUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru, - ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CUDA_KERNEL( - gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL(gru, + ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL(gru_grad, + ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu index 979a20a64ee990caa60d9ea8aaca8f9a9ed9c422..adaaf1d09cd76460d2694bf09055d5acec4146de 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -14,11 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CUDA_KERNEL( - gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit, + ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 835312851b2e42210c54992938cb217faf73ae91..0d1006658a492d17332c975432b23d62b2a35ccf 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -155,9 +155,7 @@ REGISTER_OP_CPU_KERNEL(hinge_loss, REGISTER_OP_CPU_KERNEL(hinge_loss_grad, ops::HingeLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 503b64c343118d98753329f4fbf347e32529b60e..b58f9a55756ad58cc79127a6a81e3155de396210 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -200,9 +200,7 @@ REGISTER_OP_CPU_KERNEL(im2sequence, REGISTER_OP_CPU_KERNEL(im2sequence_grad, ops::Im2SequenceGradKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index a63cd8b0071c9799177801c6ebd069cf0d7e6bae..044b8118abb0ea7e232715b669d501c9c8ee18cf 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ 
b/paddle/fluid/operators/inplace_abn_op.cu @@ -225,16 +225,14 @@ namespace plat = paddle::platform; #ifdef PADDLE_WITH_HIP // MIOPEN do not support double REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel); #else REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel, + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel, + ops::InplaceABNGradKernel); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 45b2a05211ed5cc024591543be48feb692e2ca53..80534d29b5ae4a54bfddf579d7007eb35056821a 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -1337,8 +1337,8 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1432,8 +1432,8 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1581,8 +1581,8 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, dim_grad = {n, in_d, in_h, in_w, c}; } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 88447aa830f8ba054e2c8adb393875f83576b101..d8e18f58fa9f2d3ffd712c07ba39eed7124e82b4 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -17,44 +17,23 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(isinf, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isinf, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isnan, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isnan, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isfinite, + ops::OverflowKernel, + ops::OverflowKernel, + 
ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index c7bf0d538bd97912463a4675e4907bd65136983a..093a33d89b03f8b8e77d4614a7a9a5360ad20421 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -97,8 +97,6 @@ REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); REGISTER_OP_CPU_KERNEL(l1_norm_grad, ops::L1NormGradKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 01abe645495a483fdf785248a6c4d681b52961cd..d14cc0762617e1ca7daaad8730f3cc42857a1bef 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -61,8 +61,7 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel { auto n_expert = expert_count->numel() / n_worker; const auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); dim3 grid_dim(256); dim3 block_dim(1024); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 240f6b06325f4b8d5ccb0a74621b56ba2601111f..008305bdb93eb7d409eebe11def8177bd24d3643 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -85,7 +85,7 @@ class LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif VLOG(3) << "lite engine run"; @@ -103,7 +103,7 @@ class LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif } diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fed71abe16637ff5d791fcca51d5398e162d5eb0..d631c3c7317df2879659f9f5118384c1e3205b6f 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,7 +72,7 @@ TEST(LiteEngineOp, engine_op) { framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 574b7cbec28ce9f7ffcd1157f0be677a957a50f1..b8892e9c88f08a1d498de4e7bc27501ac16c83b7 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -58,7 +58,7 @@ void serialize_params(std::string* str, std::ostringstream os; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); #else phi::CPUContext ctx; #endif diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu index 
2a42c0daa7fc58165e85d851c602a65ec287c905..9405b3564b9626ed2878ecbc2b70ddbf85efae53 100644 --- a/paddle/fluid/operators/load_combine_op.cu +++ b/paddle/fluid/operators/load_combine_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu index c122978d12c7c236ca703722ef01cc717dcc08fc..04c456ac60306d3b3fbf06aaf9e5f05828ddcb51 100644 --- a/paddle/fluid/operators/load_op.cu +++ b/paddle/fluid/operators/load_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); +REGISTER_OP_CUDA_KERNEL(load, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu index a910ad549f1bad32854524b0268e77ff3ef4d91c..25aad4c4afce1d4ce93cb5f77b0f907ad936790c 100644 --- a/paddle/fluid/operators/lod_reset_op.cu +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -16,15 +16,13 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_CUDA_KERNEL( - lod_reset_grad, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index d4b36f31e6201f8e52eb388481eb80a3dfcbfa46..ab4d95c592fc19a8af97b090017d86a3ec0c1660 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor Apply(static_cast(dev_ctx)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(dev_ctx)); + Apply(static_cast(dev_ctx)); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 41740923b420fb643cf98c32d9131286d741a6e5..073077f6586faed16832702b24218db0788cc30c 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,8 +151,7 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git 
a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index ef9bd7865d6f4b6628b1b4900f4bc3a68158049f..7b4ed84fc209b49ebd3562b5772426ce11c2425d 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -142,8 +142,7 @@ struct LookupTableV2GradCUDAFunctor { template void apply() { - auto &dev_ctx = - context_.template device_context(); + auto &dev_ctx = context_.template device_context(); bool is_sparse = context_.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index c736dfb48a681c011b5f42739c59623b1c215a1d..8c95cf1d0c9dabfc88cdd2b301292abb4e6be49a 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -97,7 +97,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha, data_layout); @@ -108,7 +108,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, @@ -138,8 +138,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, @@ -218,7 +218,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormDiff <<>>(img_size, x, @@ -236,7 +236,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, } template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -268,13 +268,11 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lrn, ops::LRNKernel); -REGISTER_OP_CUDA_KERNEL( - lrn_grad, ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL(lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc index 60364ef44869c2712aec0e671633f086662c34fc..13a0ded14b4ea1f7580ef42941d83401824d1bd5 100644 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CUDA_KERNEL( - lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL(lstm, + ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu index 11c2844ccc3df50b688d13438160c2eb78372063..8614eaf5d4959580186c2ad289e33f3b5726d7a3 100644 --- a/paddle/fluid/operators/lstmp_op.cu +++ b/paddle/fluid/operators/lstmp_op.cu @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/lstmp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CUDA_KERNEL( - lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +REGISTER_OP_CUDA_KERNEL(lstmp, + ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL(lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index f063716b2000846f2085abe47f402807e0cad950..e9d1a6a136a3ce0026377efbb8277be84e4e9e7e 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -38,10 +38,8 @@ class LstsqCUDAKernel : public framework::OpKernel { auto* solution = context.Output("Solution"); auto dito = - math::DeviceIndependenceTensorOperations(context); - auto& dev_ctx = - context.template device_context(); + math::DeviceIndependenceTensorOperations(context); + auto& dev_ctx = context.template device_context(); auto x_dims = x.dims(); auto y_dims = y.dims(); @@ -163,20 +161,19 @@ class LstsqCUDAKernel : public framework::OpKernel { }; template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - float* a, - int a_stride, - float* tau, - int tau_stride, - float* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + float* a, + int a_stride, + float* tau, + int tau_stride, + float* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -232,20 +229,19 @@ void BatchedOrmqr( } template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - double* a, - int a_stride, - double* tau, - int tau_stride, - double* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + double* a, + int a_stride, + double* tau, + int tau_stride, + double* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? 
CUBLAS_OP_T : CUBLAS_OP_N; @@ -305,9 +301,8 @@ void BatchedOrmqr( namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstsq, - ops::LstsqCUDAKernel, - ops::LstsqCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstsq, + ops::LstsqCUDAKernel, + ops::LstsqCUDAKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index b58142d793c6fd1273f998513d247e6e6fd3f5c5..6d1ff9f296eb85601c9bb9eb2d956986f48d5d8c 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -87,7 +87,7 @@ void GetClassInterval(const gpuStream_t& stream, const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -275,7 +275,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const float scale = ctx.Attr("scale"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLComm* comm; @@ -290,7 +290,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream - stream = static_cast( + stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); } @@ -377,8 +377,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 2, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -420,8 +419,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 4, sum(exp(logit - logit_max)) Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -465,7 +463,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; @@ -543,8 +541,7 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const float margin3 = context.Attr("margin3"); const float scale = context.Attr("scale"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu index d7e77e923029ea65bcf7bdb40f98b693107ba3a9..f672381ed7a4135e98b6bc413ea68e4de119792b 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cu +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -16,9 +16,7 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss_grad, - ops::MarginRankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index 2c58b99396ea0049c5b8ad7b9df93297a946747a..3b52788514b915d41ccec72dd6fca9c43cfd3b55 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -33,7 +33,7 @@ template class MarkerOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto marker_role = ctx.Attr("marker_role"); auto marker_pos = ctx.Attr("marker_pos"); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 2008e6b3fa2f3a3fb42aeabc063b6899d98087b6..80af6f673c40f65aa76d5484e1ae27e2c9ebe5e6 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -403,9 +403,9 @@ static inline int GetNumUsedThreads(const int max_threads_per_seq, } template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::LoDTensor* pre_ids, const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, @@ -531,10 +531,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index f6b0349f1ca28124ecd5b6c3e0b894f027d7ab97..87785bfdc85b9bba05345453746bf032fbb661b8 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -144,15 +144,14 @@ void TestBeamSearch() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestBeamSearch() { +void TestBeamSearch() { paddle::framework::LoDTensor ids; paddle::framework::LoDTensor scores; paddle::framework::LoDTensor pre_ids; paddle::framework::LoDTensor pre_scores; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -185,9 +184,7 @@ void TestBeamSearch - beamsearch; + paddle::operators::math::BeamSearchFunctor beamsearch; beamsearch(*context, &pre_ids, &pre_scores, @@ -235,7 +232,6 @@ TEST(BeamSearch, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(BeamSearch, GPU) { - TestBeamSearch(); + TestBeamSearch(); } #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index b8c23cafe6dedb1c6716eb9c89017ce81018f721..42a54195defd7e076215ec03b0400facabb6d2d3 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ 
b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -532,7 +532,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(half2 *qk_buf_, } template -inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQK(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -549,8 +549,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, CBLAS_TRANSPOSE transB = !k_trans ? CblasNoTrans : CblasTrans; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); blas.BatchedGEMM(transA, @@ -625,7 +624,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, } template -inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQKV(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -641,8 +640,7 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, int k = head_num * size_per_head; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); CBLAS_TRANSPOSE transA = !qk_trans ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !v_trans ? CblasNoTrans : CblasTrans; @@ -663,17 +661,16 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, } template -void MultiHeadGPUComputeFunctor::operator()( - const platform::CUDADeviceContext &dev_ctx, - int batch, - int seq_len, - int head_num, - int head_size, - T *qkptr, - const T *bias_qk_ptr, - T *tptr, - T alpha, - T beta) { +void MultiHeadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, + int batch, + int seq_len, + int head_num, + int head_size, + T *qkptr, + const T *bias_qk_ptr, + T *tptr, + T alpha, + T beta) { auto stream = dev_ctx.stream(); const int tsize = batch * head_num * seq_len * head_size; diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 55d3dd2c3e80de677caa83c2ae472c432043567c..bc59e2fa1a38b44330324d4b90c98ce5930c8e61 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -93,7 +93,7 @@ class EmbEltwiseLayerNormFunctor { template class MultiHeadGPUComputeFunctor { public: - void operator()(const platform::CUDADeviceContext &dev_ctx, + void operator()(const phi::GPUContext &dev_ctx, int batch, int seq_len, int head_num, diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 298c2f4e5efc6c900f9c8729a3364abc558dff3a..11508fd2d1eae50c1da8fd723350e74ea9a188fb 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -23,9 +23,9 @@ namespace math { * each dimension must be the same, except the axis dimension. */ template -class ConcatFunctor { +class ConcatFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::vector& input, int axis, framework::Tensor* output) { @@ -39,9 +39,9 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class SplitFunctor { +class SplitFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const std::vector& ref_inputs, int axis, @@ -51,9 +51,9 @@ class SplitFunctor { } }; -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 4f0fee91e5919ad924299c75190ee523247f585b..ccbe1c2aeed00999e3ab8bae085aaacc3574d92f 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -469,24 +469,18 @@ void TestConcatMain() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestConcatMain() { - auto* context = - new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); +void TestConcatMain() { + auto* context = new phi::GPUContext(paddle::platform::CUDAPlace()); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) .get()); context->PartialInitWithAllocator(); - ConcatCase1( - context); - ConcatCase2( - context); - ConcatCase3( - context); - ConcatCase4( - context); + ConcatCase1(context); + ConcatCase2(context); + ConcatCase3(context); + ConcatCase4(context); delete context; } @@ -495,7 +489,6 @@ void TestConcatMain(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - TestConcatMain(); + TestConcatMain(); #endif } diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu index f04b2d15349be329ee228fc8903c9b38a5349634..70b3d67caf3d41607d51d52ca0af713f0e4672d9 100644 --- a/paddle/fluid/operators/math/context_project.cu +++ b/paddle/fluid/operators/math/context_project.cu @@ -17,8 +17,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 34aeabfac647056efd99b02b2da4279e60f7cc0b..cbe76844519a159f680cd508502815da14f2a94e 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -50,8 +50,8 @@ __global__ void CosSimDyKernel(const T* x_norm, } template -struct CosSimDyFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct CosSimDyFunctor { + void operator()(const phi::GPUContext& ctx, const T* x_norm, const T* y_norm, const T* x, @@ -69,8 +69,8 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; -template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index a056341c3bf3ce44f8282da9fb9dd1971ac5492c..61682a95c132955df12b094ad08bdd85ffa44780 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -176,7 +176,7 @@ struct MatrixEighFunctor { // symmetric matrices on GPU, and uses the variable 
has_vectors // to control whether to return the eigenvectors. template -struct MatrixEighFunctor { +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, @@ -187,10 +187,9 @@ struct MatrixEighFunctor { using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto dito = - math::DeviceIndependenceTensorOperations(ctx); + math::DeviceIndependenceTensorOperations(ctx); Tensor input_trans; input_trans = dito.Transpose(input); auto *input_vector = input_trans.data(); @@ -324,34 +323,34 @@ struct MatrixEighFunctor { m(paddle::platform::complex, Che, cuComplex) \ m(paddle::platform::complex, Zhe, cuDoubleComplex) -#define EVDBUFFER_INSTANCE(T, C, CastType) \ - template <> \ - inline void MatrixEighFunctor::EvdBuffer( \ - cusolverDnHandle_t handle, \ - cusolverEigMode_t jobz, \ - cublasFillMode_t uplo, \ - int n, \ - const T *A, \ - int lda, \ - const ValueType *W, \ - int *lwork) const { \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##evd_bufferSize( \ - handle, \ - jobz, \ - uplo, \ - n, \ - reinterpret_cast(A), \ - lda, \ - W, \ - lwork)); \ +#define EVDBUFFER_INSTANCE(T, C, CastType) \ + template <> \ + inline void MatrixEighFunctor::EvdBuffer( \ + cusolverDnHandle_t handle, \ + cusolverEigMode_t jobz, \ + cublasFillMode_t uplo, \ + int n, \ + const T *A, \ + int lda, \ + const ValueType *W, \ + int *lwork) const { \ + PADDLE_ENFORCE_GPU_SUCCESS( \ + platform::dynload::cusolverDn##C##evd_bufferSize( \ + handle, \ + jobz, \ + uplo, \ + n, \ + reinterpret_cast(A), \ + lda, \ + W, \ + lwork)); \ } FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); #define EVD_INSTANCE(T, C, CastType) \ template <> \ - inline void MatrixEighFunctor::Evd( \ + inline void MatrixEighFunctor::Evd( \ cusolverDnHandle_t handle, \ cusolverEigMode_t jobz, \ cublasFillMode_t uplo, \ diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 44ce4f0d6d35f3a1411a09384a96937a0f3d40ee..49aae2ebc1da88c35959c740cf3c39628ffb0ef1 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -94,7 +94,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -180,8 +180,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -230,7 +230,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -324,10 +324,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; 
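The explicit instantiations being swapped in gru_compute.cu follow the same pattern; the replacement lines appear immediately below. Restoring the template arguments that do not survive in this text (float and double are assumptions inferred from the surrounding GRU code), the change is approximately this sketch:

// Before: instantiations bound to the fluid device context.
template struct GRUUnitFunctor<platform::CUDADeviceContext, float>;
template struct GRUUnitFunctor<platform::CUDADeviceContext, double>;

// After: the same instantiations bound to phi::GPUContext.
template struct GRUUnitFunctor<phi::GPUContext, float>;
template struct GRUUnitFunctor<phi::GPUContext, double>;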
+template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 93ee9d3a15bade59fbd07964178eaa6cbf81364e..09ec777ebb633a3a02804833ad3eb7d0b53c7b28 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -179,8 +179,7 @@ void testIm2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testIm2col() { +void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; paddle::framework::Tensor output_cfo; @@ -222,7 +221,7 @@ void testIm2colSetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -240,12 +239,12 @@ void testIm2col im2col; paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> im2col_ocf; @@ -283,12 +282,12 @@ void testIm2col col2im; paddle::operators::math::Col2ImFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -343,8 +342,7 @@ void testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 716989a78698efcbc25f1022876a2fbe33c49a4e..f18053e297e55fe67e592c990fdc77d11b84fef4 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -124,15 +124,14 @@ int UniqSampler(const Sampler& sampler, } template -void GPUSampleWithProb::operator()( - const platform::CUDADeviceContext& context, - const int seed, - const int dict_size, - const bool uniq, - const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P) { +void GPUSampleWithProb::operator()(const phi::GPUContext& context, + const int seed, + const int dict_size, + const bool uniq, + const std::size_t num_samples, + const Tensor* L, + Tensor* S, + Tensor* P) { // UNDERSTAND: dimension issues const auto lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index bb5c2ef9799e8c90ec5af675360c25b748885241..1e8fb983a94992abe9f8ead34b38f9b78294c19a 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -111,7 +111,7 @@ class SampleWithProb { template class GPUSampleWithProb { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const int seed, const int dict_size, const bool uniq, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f09578a0b1c60def98bb651bff5d9bf2fdd3aba6..7fa9dc27db9cd920e53dee8863e4730fb79bbc7f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -25,8 +25,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAdd { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const 
phi::SelectedRows& input2, phi::SelectedRows* output) { @@ -109,8 +109,8 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -210,8 +210,8 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const int64_t input2_offset, phi::SelectedRows* input2) { @@ -259,12 +259,11 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -588,14 +587,14 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, } template -struct UpdateToTensor { - void operator()(const platform::CUDADeviceContext& context, +struct UpdateToTensor { + void operator()(const phi::GPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. - MergeAdd merge_func; + MergeAdd merge_func; auto merged_in1 = merge_func(context, input1); auto in1_height = merged_in1.height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 877c3c63affedcd7858995809d0071e9e087c8d6..746a64ff58cde6eb1f0bd63dab9ab04242c5549a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -20,10 +20,9 @@ limitations under the License. 
*/ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -62,9 +61,7 @@ TEST(selected_rows_functor, gpu_add) { // simply concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math::SelectedRowsAdd - add_functor; + paddle::operators::math::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -108,9 +105,8 @@ TEST(selected_rows_functor, gpu_add) { new paddle::framework::Tensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTensor - add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); paddle::framework::Tensor tensor2_cpu; @@ -137,10 +133,9 @@ TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -169,9 +164,8 @@ TEST(selected_rows_functor, gpu_add_to) { // simply concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTo - add_to_functor; + paddle::operators::math::SelectedRowsAddTo + add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -212,9 +206,8 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); paddle::framework::Tensor tensor1_cpu; @@ -241,10 +234,9 @@ TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_merge_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant set_const; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -269,9 +261,8 @@ TEST(selected_rows_functor, gpu_merge_add) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter:: - MergeAdd - merge_add_functor; + paddle::operators::math::scatter::MergeAdd + merge_add_functor; std::vector inputs; inputs.push_back(selected_rows1.get()); diff --git 
a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index 06eca480ec622e4bcbe87eacd3ddb8e000fc0a32..84944270f453af74ff10445e8e2107830352438f 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -116,17 +116,15 @@ TEST(Seq2BatchPadding, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePadding, CUDA) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding( - *context, lod1, 16); + TestSequencePadding(*context, lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding( - *context, lod2, 128); + TestSequencePadding(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 9ee3b107bea4215e57ae594dcff5bb3b5c1e2395..a5edb1db95c3fce8c8aea6e526ef221f4f1b0375 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -189,9 +189,9 @@ __global__ void sequence_pool_kernel(Range_OP op, } template -class SequencePoolFunctor { +class SequencePoolFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, @@ -408,9 +408,9 @@ __global__ void sequence_pool_grad_kernel(Range_OP op, } template -class SequencePoolGradFunctor { +class SequencePoolGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, @@ -493,10 +493,10 @@ class SequencePoolGradFunctor { }; // sequence pooling -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 63d922b7ebb80ff2c21159f9d71ab9294f300f88..9cff64f75607b760e9c8613b04a12948d3b8bb6e 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -132,17 +132,15 @@ TEST(SequencePoolingGrad, CPU_SUM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePoolingGrad, CUDA_SUM) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum( - *context, lod1, 128); + TestSequencePoolingSum(*context, lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum( - *context, lod2, 128); + TestSequencePoolingSum(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 
fd501d5188d5f9a79c52ff08b06c61eb23e41b06..3aceceac32de21ad9f67cb4c10b38cad41345c32 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -51,9 +51,9 @@ __global__ void tree2col(const T* eta, } } template -class Tree2ColFunctor { +class Tree2ColFunctor { public: - void operator()(const paddle::platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& node_features, framework::Tensor* patch, @@ -63,7 +63,7 @@ class Tree2ColFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -128,9 +128,9 @@ class Tree2ColFunctor { } }; template -class Col2TreeFunctor { +class Col2TreeFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& patch_grad, framework::Tensor* embedding_grad, @@ -140,7 +140,7 @@ class Col2TreeFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -214,10 +214,10 @@ class Col2TreeFunctor { } }; -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 5a7764331997bffe122738ffa82e61619f4e90ff..253f4cb0279380277cffb2fbd348604eaff1c7dd 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -111,9 +111,9 @@ __global__ void KernelUnpool3dMaxGrad(const int nthreads, */ template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -148,9 +148,9 @@ class Unpool2dMaxFunctor { * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -189,9 +189,9 @@ class Unpool2dMaxGradFunctor { }; template -class Unpool3dMaxFunctor { +class Unpool3dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -230,9 +230,9 @@ class Unpool3dMaxFunctor { * All tensors are in NCDHW format. 
*/ template -class Unpool3dMaxGradFunctor { +class Unpool3dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -274,14 +274,14 @@ class Unpool3dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxFunctor; -template class Unpool3dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index ec3926b95ee8768eed0c061c830082f26c786a9c..c0c4ed5bb5d699e750ff048b80d977a7bcd1ff28 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -132,15 +132,14 @@ void testVol2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testVol2col() { +void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; paddle::framework::Tensor output; paddle::framework::Tensor output_tmp; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -202,9 +201,7 @@ void testVol2col - vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -230,9 +227,7 @@ void testVol2col - col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -256,7 +251,6 @@ void testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index c79073861ab6e01beb52f703e3ecfcbec1681b18..ff7ab502e8efefeb3235977fddc59430f1e456a7 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1055,20 +1055,17 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( matmul, - ops::MatMulKernel, - ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); REGISTER_OP_CUDA_KERNEL( matmul_grad, - ops::MatMulGradKernel, - ops::MatMulGradKernel, - ops::MatMulGradKernel); -REGISTER_OP_CUDA_KERNEL( - matmul_grad_grad, - ops::MatMulDoubleGradKernel, - ops::MatMulDoubleGradKernel); + ops::MatMulGradKernel, + ops::MatMulGradKernel, + ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL(matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif REGISTER_OP_VERSION(matmul).AddCheckpoint( diff --git a/paddle/fluid/operators/mean_iou_op.cu 
b/paddle/fluid/operators/mean_iou_op.cu
index 1359bd62b494133e3cc3af046c5e343aa31c0c9d..08ab074718b9163aba7bb5193c163783830d1af4 100644
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
@@ -92,7 +92,7 @@ template
 class MeanIoUCUDAOpKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& dev_ctx = ctx.template device_context();
+    auto& dev_ctx = ctx.template device_context();
     auto& place = *dev_ctx.eigen_device();
     // get input and output tensor
     auto* predictions = ctx.Input("Predictions");
diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h
index 8cd84f4b59e8ceae1e65db38e202a98ef2635920..b0513b0af8470f2d40fca0d3aaac4da7d9385714 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.h
+++ b/paddle/fluid/operators/memcpy_h2d_op.h
@@ -40,8 +40,7 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    auto stream =
-        static_cast(&dev_ctx_)->stream();
+    auto stream = static_cast(&dev_ctx_)->stream();
 #else
     auto stream = nullptr;
 #endif
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
index 90d5fb3eaeb1f155eeea29ea0cf3f5ecd610f5f0..16b9b5dc6bdf13443dfbf8528c8f34391c6fe8aa 100644
--- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    merge_selected_rows,
-    ops::MergeSelectedRowsKernel,
-    ops::MergeSelectedRowsKernel);
+REGISTER_OP_CUDA_KERNEL(merge_selected_rows,
+                        ops::MergeSelectedRowsKernel,
+                        ops::MergeSelectedRowsKernel);
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 1e369c81538ed2cb74059c214fc3e5b975452bdb..310d28738fc756ad252305e018021d1ef38d5c57 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -155,5 +155,4 @@ REGISTER_OPERATOR(minus,
                   ops::MinusGradMaker);
 REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel);
-REGISTER_OP_CUDA_KERNEL(
-    minus, ops::MinusKernel);
+REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel);
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
index 9450b72c95f19563f352ebf33c18ed40d01b8346..67c3a5d90da9aec7f33f0f11f43a5bcc39bf81f7 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -76,8 +76,7 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel {
 } // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    modified_huber_loss,
-    ops::ModifiedHuberLossKernel);
+REGISTER_OP_CUDA_KERNEL(modified_huber_loss,
+                        ops::ModifiedHuberLossKernel);
 REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
                         ops::ModifiedHuberLossGradGPUKernel);
diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
index b74c1fca088db45162dcd83b391b3ecdcaff79a3..01ca5d43090454b2392a10a627632a17e09941d2 100644
--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
@@ -58,7 +58,7 @@ class NCCLTester : public ::testing::Test {
     paddle::platform::CPUPlace cpu_place;
     for (size_t i = 0; i < gpu_list_.size(); ++i) {
       p::CUDAPlace place(i);
-      auto *ctx = new p::CUDADeviceContext(place);
+      auto *ctx = new phi::GPUContext(place);
       ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                             .GetAllocator(place, ctx->stream())
                             .get());
@@ -184,7 +184,7 @@ void NCCLTester::testNcclAllReduceOp() {
     result_tensor->Resize(kDims);
     auto *ct = result_tensor->mutable_data(cpu_place);
-    auto *dev_ctx = static_cast(dev_ctxs_[i]);
+    auto *dev_ctx = static_cast(dev_ctxs_[i]);
     paddle::memory::Copy(cpu_place,
                          ct,
                          p::CUDAPlace(gpu_list_[i]),
@@ -296,7 +296,7 @@ void NCCLTester::testNcclBcastOp() {
     result_tensor->Resize(kDims);
     auto *ct = result_tensor->mutable_data(cpu_place);
-    auto *dev_ctx = static_cast(dev_ctxs_[idx]);
+    auto *dev_ctx = static_cast(dev_ctxs_[idx]);
     paddle::memory::Copy(cpu_place,
                          ct,
                          p::CUDAPlace(gpu_list_[idx]),
diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu
index 64f5bc9eab42a6ca842c195f18c2dc085b87ec65..330163b1f9350ba483f2b093a03ab03a8b70f068 100644
--- a/paddle/fluid/operators/number_count_op.cu
+++ b/paddle/fluid/operators/number_count_op.cu
@@ -92,8 +92,7 @@ class NumberCountOpCUDAKernel : public framework::OpKernel {
     int64_t batch_size = numbers->numel();
     auto place = context.GetPlace();
-    const auto& dev_ctx =
-        context.template device_context();
+    const auto& dev_ctx = context.template device_context();
     framework::DDim out_dims = phi::make_ddim({upper_range});
     auto out_data = number_count->mutable_data(out_dims, place);
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
index 6dfc4a7d13cadff447e661714b5a44ef2dc44e9d..85594ff05742e4dfab1f437ca76e04ad9b9a88cb 100644
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -99,7 +99,6 @@ class OneHotCUDAKernel : public framework::OpKernel {
 } // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    one_hot,
-    ops::OneHotCUDAKernel,
-    ops::OneHotCUDAKernel);
+REGISTER_OP_CUDA_KERNEL(one_hot,
+                        ops::OneHotCUDAKernel,
+                        ops::OneHotCUDAKernel);
diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h
index 3314e899a13d26f082f01a104f706f32b635e5b8..205eb2853a3419fd6ae2816f39fc1001d7e94895 100644
--- a/paddle/fluid/operators/optimizers/cast_with_ptr.h
+++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h
@@ -31,7 +31,7 @@ struct CastFunctor {
 };
 template
-static void VecCastKernel(const platform::CUDADeviceContext &ctx,
+static void VecCastKernel(const phi::GPUContext &ctx,
                           const InT *x,
                           OutT *y,
                           size_t n) {
@@ -53,7 +53,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx,
 } // namespace details
 template
-static void LaunchCastKernel(const platform::CUDADeviceContext &ctx,
+static void LaunchCastKernel(const phi::GPUContext &ctx,
                              const InT *x,
                              OutT *y,
                              size_t n) {
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
index dc568802a2b19fee5c8d7fd8d07c929cba8ab4e3..30825a6a329d3b2f5c412d45ce9772c54f70e520 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
@@ -14,6 +14,5 @@ limitations under the License.
*/ #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - decayed_adagrad, - ops::DecayedAdagradOpKernel); +REGISTER_OP_CUDA_KERNEL(decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu index e7fdeb617de9ec6c4a3c3487900db5ba0dbd3c70..7909d58a6441663c804b317de92296f553e22ebc 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu @@ -15,6 +15,5 @@ #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_momentum, - ops::DGCMomentumKernel); +REGISTER_OP_CUDA_KERNEL(dgc_momentum, + ops::DGCMomentumKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 7b1397b7df663000f10ad6b67eef0c065edfd23d..e7d795ccc579c86d9dbb2b6dc898cc49af8dba98 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -178,22 +178,21 @@ static size_t FillAlignmentPaddingInfo(std::vector *infos, } template -static T *TensorFillConstant(const platform::CUDADeviceContext &dev_ctx, +static T *TensorFillConstant(const phi::GPUContext &dev_ctx, framework::Tensor *tensor, const framework::DDim &dims, T value) { tensor->Resize(dims); auto *ptr = tensor->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; set_constant(dev_ctx, tensor, value); return ptr; } -static framework::Tensor CastDataForInitedTensor( - const platform::CUDADeviceContext &dev_ctx, - framework::Tensor *origin, - framework::Tensor *fused_out, - size_t numel_offset) { +static framework::Tensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, + framework::Tensor *origin, + framework::Tensor *fused_out, + size_t numel_offset) { PADDLE_ENFORCE_EQ(origin->IsInitialized(), true, platform::errors::InvalidArgument( @@ -338,12 +337,12 @@ static T ClipByBound(T x, T low_value, T high_value) { } template -class DistributedFusedLambInitOpKernel +class DistributedFusedLambInitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { VLOG(10) << "starts to run DistributedFusedLambInitOp"; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto place = ctx.GetPlace(); auto stream = dev_ctx.stream(); @@ -790,4 +789,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb_init, - ops::DistributedFusedLambInitOpKernel); + ops::DistributedFusedLambInitOpKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f8d55ff9cf72a46000bfe00e88b6163365757d0f..394e49dd529e0ae523d56c62e7040d807adae5c7 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -242,8 +242,7 @@ static void LogParamAndTrustRatioDivSquareNorm( } } -static bool IsFinite(const platform::CUDADeviceContext &dev_ctx, - const float *ptr) { +static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { auto stream = dev_ctx.stream(); float cpu_value; #ifdef PADDLE_WITH_HIP @@ -509,7 
+508,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( template static void MultiTensorUpdateLambMomentAndTrustRatioDiv( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const T *param_p, @@ -779,7 +778,7 @@ template static void MultiTensorUpdateLambParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const MasterT *trust_ratio_div, @@ -898,7 +897,7 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, } template -static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, +static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, const T1 *x, const T2 *scale, T1 *y, @@ -925,7 +924,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { static_assert(std::is_same::value || std::is_same::value, @@ -974,15 +973,14 @@ static void NCCLSumWithScaleBase(const T *sendbuff, } template -static void NCCLReduceScatterWithScale( - const T *sendbuff, - T *recvbuff, - size_t recvcount, - size_t nranks, - ncclComm_t comm, - gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, - const T *scale = nullptr) { +static void NCCLReduceScatterWithScale(const T *sendbuff, + T *recvbuff, + size_t recvcount, + size_t nranks, + ncclComm_t comm, + gpuStream_t stream, + const phi::GPUContext &dev_ctx, + const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); } @@ -994,7 +992,7 @@ static void NCCLAllReduceWithScale(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); @@ -1104,7 +1102,7 @@ static std::string GetMinMaxStr(const T *x, true, platform::errors::InvalidArgument("Only support CUDAPlace currently.")); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); @@ -1276,13 +1274,12 @@ static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, } template -static void LaunchElementwiseAddWithCastKernel( - const platform::CUDADeviceContext &dev_ctx, - const T1 *x, - const T2 *y, - T3 *z, - int n, - gpuStream_t stream) { +static void LaunchElementwiseAddWithCastKernel(const phi::GPUContext &dev_ctx, + const T1 *x, + const T2 *y, + T3 *z, + int n, + gpuStream_t stream) { int vec_size = std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)), GetChunkedVecSize(z, 0)); @@ -1300,12 +1297,12 @@ static void LaunchElementwiseAddWithCastKernel( } template -class DistributedFusedLambOpKernel +class DistributedFusedLambOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); @@ -2135,4 +2132,4 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb, - ops::DistributedFusedLambOpKernel); + ops::DistributedFusedLambOpKernel); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu 
b/paddle/fluid/operators/optimizers/ftrl_op.cu index acf8e38ca0f5a3cf9899f4898898013e8a2afdd2..dbea7e4d51cb8d3f1be47e0167be205d9d370f66 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cu +++ b/paddle/fluid/operators/optimizers/ftrl_op.cu @@ -13,5 +13,4 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - ftrl, ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL(ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu index a9f880fdbb67d380f4975174eafbb2f951cb3a4c..0d60979eef0bd7820aa21c7d9dc7a2e49cf90091 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/fluid/operators/optimizers/lamb_op.cu @@ -17,7 +17,6 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lamb, - ops::LambOpKernel, - ops::LambOpKernel, - ops::LambOpKernel); + ops::LambOpKernel, + ops::LambOpKernel, + ops::LambOpKernel); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index d9aef74931ac1d50fdf1b26745edafc7b399310d..5337e56b28d5b86392ebc892f0c5698a52119a3e 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -419,25 +419,24 @@ __global__ void MomentumLarsKernel(const T* param, } template -inline void SeparatedLarsMomentumOpCUDAKernel( - const platform::CUDADeviceContext& cuda_ctx, - const T* param_data, - T* param_out_data, - const MT* velocity_data, - MT* velocity_out_data, - const T* grad_data, - const MT* lr, - MT* p_buffer, - MT* g_buffer, - const MT mu, - const MT lars_coeff, - const MT weight_decay, - const MT epsilon, - const MT rescale_grad, - const int64_t numel, - const MT* master_param_data, - MT* master_out_data, - const bool is_amp) { +inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx, + const T* param_data, + T* param_out_data, + const MT* velocity_data, + MT* velocity_out_data, + const T* grad_data, + const MT* lr, + MT* p_buffer, + MT* g_buffer, + const MT mu, + const MT lars_coeff, + const MT weight_decay, + const MT epsilon, + const MT rescale_grad, + const int64_t numel, + const MT* master_param_data, + MT* master_out_data, + const bool is_amp) { LarsThreadConfig lars_thread_config(numel); L2NormKernel<< { void Compute(const framework::ExecutionContext& ctx) const override { int num_blocks_per_sm = 0; bool multi_precision = ctx.Attr("multi_precision"); - auto& cuda_ctx = ctx.template device_context(); + auto& cuda_ctx = ctx.template device_context(); int sm_num = cuda_ctx.GetSMCount(); - framework::Tensor tmp_buffer_t = - ctx.AllocateTmpTensor( - {LARS_BLOCK_SIZE << 1}, cuda_ctx); + framework::Tensor tmp_buffer_t = ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; @@ -684,7 +682,6 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu 
b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu
index be3f6d6c91a98b5d502e9b276e708272aa21e729..6419e524f71523fd32cebb2f1e908e67d195c775 100644
--- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu
+++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu
@@ -20,5 +20,5 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     pow2_decay_with_linear_warmup,
-    ops::Pow2DecayWithLinearWarmupOpKernel,
-    ops::Pow2DecayWithLinearWarmupOpKernel);
+    ops::Pow2DecayWithLinearWarmupOpKernel,
+    ops::Pow2DecayWithLinearWarmupOpKernel);
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
index 591dead3b12763e4cd1b9c390a87816ab121fbf8..c338f4cc717a5747c76ad2336628b9835ad97058 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
@@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    proximal_adagrad,
-    ops::ProximalAdagradOpKernel);
+REGISTER_OP_CUDA_KERNEL(proximal_adagrad,
+                        ops::ProximalAdagradOpKernel);
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
index d556fa74f19529d0e2f80d4c6dbfca62498c9dcc..edc911134c7293f34c16900337568af48877ff88 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
@@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    proximal_gd,
-    ops::ProximalGDOpKernel);
+REGISTER_OP_CUDA_KERNEL(proximal_gd,
+                        ops::ProximalGDOpKernel);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu
index 6fd49248db1eea8c67ac24c3ef10b6f57671d84a..28ca7c6d8d3b797e66d8420d54a642ad2af66c17 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cu
+++ b/paddle/fluid/operators/optimizers/sgd_op.cu
@@ -65,8 +65,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 } // namespace
 template
-class SGDOpKernel
-    : public framework::OpKernel {
+class SGDOpKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu
index cbafefb34fdf290257922a7a33ebd979846b6dc3..d8f8e9749b8be85ed02b4bf719d48230f3f971ec 100644
--- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu
@@ -19,7 +19,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sparse_momentum,
-    ops::SparseMomentumOpKernel,
-    ops::SparseMomentumOpKernel,
-    ops::SparseMomentumOpKernel);
+    ops::SparseMomentumOpKernel,
+    ops::SparseMomentumOpKernel,
+    ops::SparseMomentumOpKernel);
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
index 25dae1ec7f34b6757a24cda586e3b7817fcdd329..5ed217b2e60ef5b408bccad076cb11844f46aac7 100644
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -508,8 +508,8 @@ class Pad2dGradCUDAKernel : public
framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), + phi::funcs::SetConstant set_zero; + set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index e523c93f5d10bcfaf38ea1b7a3e84eca61a9a647..254e8ebe5c5701ecf29de5e3eff33e69349747d1 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -259,17 +259,14 @@ REGISTER_OP_CPU_KERNEL( ops::PadConstantLikeGradKernel, ops::PadConstantLikeGradKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL(pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); REGISTER_OP_CUDA_KERNEL( pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 7e365dbeb1dacb9bc0d876261baa642512034a64..f4d8f7083b0073ed4fc3b35c6f189ce8eafc5501 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -101,7 +101,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { int all_length = batch_size * out_batch_len; constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; @@ -171,8 +171,8 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { auto grad_batch_len = partial_len * in_num; auto all_length = grad_batch_len * batch_size; // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index a8d0b145082b3c23b0dd92e6bde2e27945fad1a2..69517233bf3be65ddc5c8624ba49d76175068c3e 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -94,7 +94,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; 
@@ -163,8 +163,8 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { auto out_num = outs.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index 04249d3779489db4384e7ac2377a2931c235777c..ac4666bb17471100f180e80acf2af669dd5b914b 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -426,7 +426,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(prroi_pool, ops::GPUPRROIPoolOpKernel, ops::GPUPRROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - prroi_pool_grad, - ops::GPUPRROIPoolGradOpKernel, - ops::GPUPRROIPoolGradOpKernel); +REGISTER_OP_CUDA_KERNEL(prroi_pool_grad, + ops::GPUPRROIPoolGradOpKernel, + ops::GPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 85d57974ede7ae788ac0846998edd1457d3fcb9d..3b626cd762eb75b7ce3cd5db165a5f95a9636c11 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -130,4 +130,4 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL( prune_gate_by_capacity, - ops::PruneGateByCapacityCUDAKernel); + ops::PruneGateByCapacityCUDAKernel); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc index c8342e6d5d11b9ab6de9e4845c3433ee7f08512f..6fe0156c01aa12c90821fef2ceaad61335f67e17 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc @@ -19,4 +19,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc index 5c4ae3bdcfef30b4450d809568af6a1d7f3f67a7..bba442a630abffc1f9aa69d7f248008b4b048241 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc @@ -19,5 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_push_sparse, - ops::DistributedPushSparseKernel, - ops::DistributedPushSparseKernel); + ops::DistributedPushSparseKernel, + ops::DistributedPushSparseKernel); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 73eb3f1509223bc8b1de01a8c128faae736f669f..d3f1d17e7a3f18d3352d0ab1786fb196ee5c5e01 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -98,12 +98,11 @@ class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker { 
namespace ops = paddle::operators; REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker); -REGISTER_OP_CUDA_KERNEL( - send_and_recv, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel); +REGISTER_OP_CUDA_KERNEL(send_and_recv, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel); REGISTER_OP_CPU_KERNEL(send_and_recv, ops::SendAndRecvKernel, ops::SendAndRecvKernel, diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 8d0d2d3090c1746b2fd0f1753902f67504190cc9..9aef7051fa5b5261004cd6365b84ebd69d487bad 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -100,8 +100,7 @@ void InitTensorsOnClient(framework::Scope* scope, // ids_var->mutable_data(framework::DDim({rows_numel, 1}), // *place); // for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); auto micro_id_var = scope->Var("microbatch_id")->GetMutable(); @@ -245,7 +244,7 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 9255a5f164bc4b5b47aa2a895103db19cdeca45b..9c13934ccd49af6277b51daf3c7f3b1c34cca5f5 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -223,21 +223,17 @@ REGISTER_OP_CPU_KERNEL( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( py_layer, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel>, - ops::PyLayerOpKernel>); + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel>, + ops::PyLayerOpKernel>); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 24ae989532dd8778162f174e9c6c95f286cbe672..8ae18a56329418a2e220b614c55ddbef38b8667f 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -36,8 +36,7 @@ class QrGPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { bool compute_q; bool reduced_mode; - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const Tensor& x = *context.Input("X"); Tensor& q = *context.Output("Q"); Tensor& r = *context.Output("R"); @@ -69,8 +68,7 @@ class QrGPUKernel : public framework::OpKernel { size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = - math::DeviceIndependenceTensorOperations(context); + math::DeviceIndependenceTensorOperations(context); // Note: allocate temporary tensors because of lacking in-place operatios. 
// Prepare qr @@ -94,7 +92,7 @@ class QrGPUKernel : public framework::OpKernel { auto qr_data = qr.mutable_data(context.GetPlace()); auto tau_data = tau.mutable_data(context.GetPlace()); - BatchedGeqrf( + BatchedGeqrf( dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); if (reduced_mode) { @@ -114,16 +112,16 @@ class QrGPUKernel : public framework::OpKernel { // Perform QRGQR for Q using the result from GEQRF // Transpose 'q' to retore the original row-major order if (reduced_mode) { - BatchedOrgqr(dev_ctx, - batch_size, - m, - min_mn, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + min_mn, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -142,29 +140,29 @@ class QrGPUKernel : public framework::OpKernel { qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - new_qr_data, - m, - tau_data, - new_qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + new_qr_data, + m, + tau_data, + new_qr_stride, + tau_stride); auto trans_q = dito.Transpose(new_qr); framework::TensorCopy(trans_q, q.place(), &q); } else { - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -175,16 +173,15 @@ class QrGPUKernel : public framework::OpKernel { }; template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -227,16 +224,15 @@ void BatchedGeqrf( } template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -279,17 +275,16 @@ void BatchedGeqrf( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -333,17 +328,16 @@ void BatchedOrgqr( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto 
handle = dev_ctx.cusolver_dn_handle(); @@ -391,9 +385,8 @@ void BatchedOrgqr( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); -REGISTER_OP_CUDA_KERNEL( - qr_grad, - ops::QrGradKernel, - ops::QrGradKernel); +REGISTER_OP_CUDA_KERNEL(qr_grad, + ops::QrGradKernel, + ops::QrGradKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 93c688aa6428d5fbf118481f4d6904c93dd61451..37ca11db3e3e2ea9d8bbb873d781b0e765bc4dfe 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -24,8 +24,8 @@ namespace paddle { namespace operators { template -struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -61,14 +61,14 @@ struct ChannelDequantizeFunctorV2 { } }; -template struct ChannelDequantizeFunctorV2; -template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_linear, ops::DeQuantizeLinearKernel, ops::DeQuantizeLinearKernel, diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu index 55f7615d0f12acf50d1db160d2df767af76069e2..8a10b96a6f01061897433f4577aab076c94caf33 100644 --- a/paddle/fluid/operators/random_crop_op.cu +++ b/paddle/fluid/operators/random_crop_op.cu @@ -16,7 +16,7 @@ namespace ops = paddle::operators; template -using Kernel = ops::RandomCropKernel; +using Kernel = ops::RandomCropKernel; REGISTER_OP_CUDA_KERNEL(random_crop, Kernel, Kernel, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index aee430b50579d230394485d81450f8e60a99b137..253560d981d28330b974fb22df99469f078bcd27 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -39,7 +39,7 @@ struct Random { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct Random { +struct Random { using Engine = thrust::minstd_rand; template diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 61e38fb00fc7244c7651cf5f1bfebfe160e88009..0b8aaf2d97078adde3da271b0847771e52553a44 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -60,8 +60,7 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel { auto out = context.Output("Out"); auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::TensorCopy(*topk_idx, place, out); size_t N = topk_idx->dims()[0]; diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 7571fcae27071b8819ff84dc721e521cd3908ab3..83f6f23f98506328af49c97e9e3cee0b5f59f672 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -62,7 +62,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template 
device_context(); + auto &dev_ctx = ctx.template device_context(); int max_ins = std::max(ins_num, max_size); @@ -83,8 +83,8 @@ class RankAttentionCUDAKernel : public framework::OpKernel { auto ins_rank_eigen = framework::EigenVector::Flatten(*ins_rank); auto out_eigen = framework::EigenVector::Flatten(*Out); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); param_help_eigen.device(place) = param_help_eigen.constant(static_cast(0)); @@ -135,7 +135,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int64_t strideA = block_matrix_row; int64_t strideB = block_matrix_row * para_col; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, 1, @@ -176,9 +176,9 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { auto rank_offset_dims = rank_offset->dims(); auto max_rank = (rank_offset_dims[1] - 1) / 2; int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &dev_ctx = ctx.template device_context(); + auto &place = + *ctx.template device_context().eigen_device(); int max_ins = std::max(ins_num, max_size); // initialize out grad @@ -201,7 +201,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { const T *ins_rank_data = ins_rank->data(); T *param_grad_data = param_grad.data(); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -242,7 +242,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(rank_attention, ops::RankAttentionCUDAKernel, ops::RankAttentionCUDAKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index edf82d00950ae298ee106a599b9dc18217b55b6e..b353b2992ce1915c35964e0fdb90a649c3ee4ecd 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -245,10 +245,7 @@ REGISTER_OP_CPU_KERNEL(rank_loss_grad, ops::RankLossGradKernel); REGISTER_OP_CUDA_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); + rank_loss, paddle::operators::RankLossKernel); REGISTER_OP_CUDA_KERNEL( rank_loss_grad, - paddle::operators::RankLossGradKernel); + paddle::operators::RankLossGradKernel); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a36d51e42f5c8bbee728e83ec012c64eb2da11cc..b9c608b62e7db55bb3b41bb6e3f12a78d58fc445 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -52,8 +52,8 @@ BufferedReader::BufferedReader( if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = - ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() - .Get(place_))) + ((phi::GPUContext *)(platform::DeviceContextPool::Instance().Get( + place_))) ->stream(); events_.resize(buffer_size); for (auto &event : events_) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 95bb061077114da6098136eaf31d8be6606142e1..d7f153700cfa2ff11918fc88831a4866be8de7bd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ 
b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -31,7 +31,7 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, +void TensorReduceImpl(const phi::GPUContext& dev_ctx, const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index e022e128c7fe101ecf7f37208e504695b17dba16..ea21b985e7f7b0bf067bc448c382cf3cf9a13fbe 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -161,8 +161,7 @@ class CUDARenormKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {&pow_value}; auto func = UnsignedPowFunctor(p); - const auto& cuda_ctx = - context.template device_context(); + const auto& cuda_ctx = context.template device_context(); paddle::operators::LaunchSameDimsElementwiseCudaKernel( cuda_ctx, ins, &outs, func); diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu index 3371134f3443b6b9a97f7a0e03265a0766b9b677..07099c30271ecb5b997830f009716dd68c09d2cb 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cu +++ b/paddle/fluid/operators/repeat_interleave_op.cu @@ -88,8 +88,7 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); int repeats = context.Attr("Repeats"); framework::LoDTensor index; @@ -218,8 +217,7 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { int64_t numel = in_grad->numel(); int64_t out_nums = output_grad->numel(); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); index_select_grad_init <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -328,23 +326,16 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( repeat_interleave, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel); + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel); REGISTER_OP_CUDA_KERNEL( repeat_interleave_grad, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel); + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b665cce096207daf540af0bcfb3ef4019463cadd..6a25e2c7902871445e789f12516fab28a0440a80 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -428,7 +428,7 @@ class ReshapeKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeKernel(static_cast(dev_ctx), *in, pt_scalar_shape, @@ -461,7 +461,7 @@ class ReshapeGradKernel { } #if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } @@ -491,7 +491,7 @@ class ReshapeDoubleGradKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 633811862d84c2fb5e530e03e1bbdff5890946ee..f69889f7f8f25fb034ea2e28e363bb4d9fb5e086 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -323,8 +323,7 @@ __global__ void RowConvGradFilter(const T *in, } // namespace template -class RowConvKernel - : public framework::OpKernel { +class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -378,8 +377,7 @@ class RowConvKernel }; template -class RowConvGradKernel - : public framework::OpKernel { +class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -418,7 +416,7 @@ class RowConvGradKernel size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -494,8 +492,6 @@ class RowConvGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - row_conv, ops::RowConvKernel); -REGISTER_OP_CUDA_KERNEL( - row_conv_grad, - ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL(row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL(row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cu.cc b/paddle/fluid/operators/run_program_op.cu.cc index 19cd354c18f3a01863d0ea6269d9fed343e78d1d..b3383434203e1e79d1f21c31d13a8e5e665ca94f 100644 --- a/paddle/fluid/operators/run_program_op.cu.cc +++ b/paddle/fluid/operators/run_program_op.cu.cc @@ -20,9 +20,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; /* see [Why use single type kernel] */ -REGISTER_OP_CUDA_KERNEL( - run_program, - ops::RunProgramOpKernel); -REGISTER_OP_CUDA_KERNEL( - run_program_grad, - ops::RunProgramGradOpKernel); +REGISTER_OP_CUDA_KERNEL(run_program, + ops::RunProgramOpKernel); +REGISTER_OP_CUDA_KERNEL(run_program_grad, + ops::RunProgramGradOpKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 1fb9942b37ad41c2030096add64285ea3dec09a7..d0d8af95a3f725dfc540680d1433b3767dcb25d8 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -145,7 +145,7 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, sampled_logits, static_cast(0)); auto sampled_labels_data = @@ -244,7 +244,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = 
context.cuda_device_context(); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu index 71476fd802bdd9e617e69d422b1e931178f05322..e96aafa38297823d9446d527b888e059b63ca1e6 100644 --- a/paddle/fluid/operators/save_combine_op.cu +++ b/paddle/fluid/operators/save_combine_op.cu @@ -16,9 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu index 056894dbae1562821c5b566f20f6d90bfa5b94af..03753b6e7e3a18a5f6b00c14db4cf40d6768f722 100644 --- a/paddle/fluid/operators/save_op.cu +++ b/paddle/fluid/operators/save_op.cu @@ -19,11 +19,10 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 9b1d7a27e58e42af50524c231cdf5a008584a3f1..be406db50569d30bad205856888a6a7977fef00c 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -53,6 +53,5 @@ class GPUSeedKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - seed, - paddle::operators::GPUSeedKernel); +REGISTER_OP_CUDA_KERNEL(seed, + paddle::operators::GPUSeedKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index f2117a2f098335625e9149c3cd8c8b3f3f0643f9..2374ec02e8fdf1293fa7203b1bc0c73ab13c7ec0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -18,21 +18,13 @@ REGISTER_OP_CUDA_KERNEL( sequence_concat, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel); + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel); REGISTER_OP_CUDA_KERNEL( sequence_concat_grad, - paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel); + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc index 600981b5e96c279329a67b608a8dd94dee7d88ef..5939ede964cc3de7a1f4dd16e90b7a9c2a4a2578 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc @@ -15,11 +15,9 @@ limitations under 
the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_conv, - ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 363c40ce26d837501742f2e96fca311b56376564..cacd777f17e453c50ecd063a2ef1d00e052a05ca 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -66,9 +66,9 @@ static __global__ void sequence_expand_as_grad_kernel( } template -struct SequenceExpandAsFunctor { +struct SequenceExpandAsFunctor { void operator()( - const platform::CUDADeviceContext &context, + const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*expand referenced lod*/ LoDTensor *out) { @@ -97,8 +97,8 @@ struct SequenceExpandAsFunctor { }; template -struct SequenceExpandAsGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceExpandAsGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const framework::Vector &ref_lod, /*expand based lod*/ LoDTensor *dx) { @@ -133,17 +133,14 @@ struct SequenceExpandAsGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand_as, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_as_grad, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel); + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 5ba02527825c62c0621dbd0b9a07d4fa3cc79ce5..f6e082f4d2acee6d31100bfbd92732b4ed000bf4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -96,7 +96,7 @@ void GetOutputOffset(const framework::Vector& x_lod, } template -static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, +static int ExpandByMemoryCopy(const phi::GPUContext& context, const LoDTensor& x, LoDTensor* out, const framework::Vector& x_lod, @@ -142,9 +142,9 @@ static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, } template -struct SequenceExpandFunctor { +struct SequenceExpandFunctor { void operator()( - const platform::CUDADeviceContext& context, + const phi::GPUContext& context, const LoDTensor& x, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -194,8 +194,8 @@ struct SequenceExpandFunctor { }; template -struct SequenceExpandGradFunctor { - void operator()(const 
platform::CUDADeviceContext& context, +struct SequenceExpandGradFunctor { + void operator()(const phi::GPUContext& context, const LoDTensor& dout, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand based lod*/ @@ -228,16 +228,14 @@ struct SequenceExpandGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_grad, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel); + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu index e963ce610e2c147d66087a1df59f67a04d899ccc..b4284d2717a044d94404257b306bc98d5e52c951 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu @@ -16,11 +16,7 @@ REGISTER_OP_CUDA_KERNEL( sequence_mask, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel); + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu index 7fc64a530ef5442ae927faac96ad92a4126febcd..84a3e8da141e583f81908068a1f167094e5f4335 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu @@ -15,15 +15,13 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pad, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pad_grad, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu index 4897474a485d8417854ffb53aa8ee64321c78ae7..882ec66f501db0036ba5d2d26bb5e5b0dd9e7dff 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu @@ -14,9 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pool, - ops::SequencePoolKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pool_grad, - ops::SequencePoolGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu index 38bc599165d5f84f67e2fe08bf96ebef4b03d8a4..eaf34643a07671bb78d5ef6d90a088342dad12cd 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu @@ -15,16 +15,14 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reshape, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); REGISTER_OP_CUDA_KERNEL( sequence_reshape_grad, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel); + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu index 0a59ed7f9fee07bc3b12909973535f31ef049a4a..810130669b508b99559ba0df9dc4688ff045bd32 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu @@ -16,10 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reverse, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu index a4b0ea2e5b2f4972a3004e427bcae96e87a5a9e9..ecf39a07309b0c17490a0d7554b5faf9fb288634 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_slice, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_slice, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 58e99364f4f6b88ba548354fcb051d8fc74cbac9..b060aa9f08b15b22334c3b3b9319f387c72ff37e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -61,10 +61,8 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { phi::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxCUDNNFunctor()( - ctx.template device_context(), - &x_i, - &out_i); + math::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; @@ -97,8 +95,8 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradCUDNNFunctor()( - ctx.template device_context(), + math::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, &x_grad_i); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index bb0ad26b51bb4c3079459161b1a560bcf0512b86..5417c20f3d4196caaeeceb3841bf0fe2e1c40fc2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -40,8 +40,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; } #endif @@ -149,8 +148,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? 
true : false; } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 696f6e7ca88a8b232ac15b4bc06aaa7bfbead2d3..360f9055519221303aafa730efdde8cab5dd2553 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -121,8 +121,8 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, } template -struct SequenceSoftmaxFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*referenced lod*/ LoDTensor *out) { @@ -146,8 +146,8 @@ struct SequenceSoftmaxFunctor { }; template -struct SequenceSoftmaxGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const LoDTensor &out, const framework::Vector &ref_lod, /*referenced lod*/ @@ -177,12 +177,10 @@ struct SequenceSoftmaxGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL(sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu index bf54f77f5b55cf7eb19873e352359c028207308a..4124e17cb09a8e89b6f099ef439f5f7b76cf7193 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_unpad, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_unpad_grad, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel); + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 7803f407181d00532ebbf4e3078ba562186aabb0..6b70b8d37d79c9320dedbb05974aaa8b36c7495a 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -88,7 +88,7 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *shuffleidx_data = shuffleidx->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_CUDA const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); #else @@ -106,8 +106,8 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *out_data = out->mutable_data(ctx.GetPlace()); ReorderFunctor functor( x_data, shuffleidx_data, out_data, x_embed_size); - platform::ForRange for_range( - dev_ctx, elem_size * x_embed_size); + platform::ForRange for_range(dev_ctx, + elem_size * x_embed_size); for_range(functor); auto *seed_out_data = seed_out->mutable_data(phi::make_ddim({1}), @@ -136,10 +136,9 @@ class ShuffleBatchGradCUDAKernel : public framework::OpKernel { auto x_embed_size = x_grad->dims()[x_grad->dims().size() - 1]; ReorderFunctor functor( out_grad_data, shuffleidx_data, x_grad_data, x_embed_size); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // TODO(zengjinle): for small data, direct cudaMemcpy may be better - platform::ForRange for_range(dev_ctx, - x_grad->numel()); + platform::ForRange for_range(dev_ctx, x_grad->numel()); for_range(functor); #endif } diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 02c0cfdd9693cec01acead3f94717a630fd61f56..f51724d843107a33c142c5443aa18d5aa9450f49 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -129,12 +129,9 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( shuffle_channel, - ops::ShuffleChannelOpCUDAKernel, - ops::ShuffleChannelOpCUDAKernel); + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( shuffle_channel_grad, - ops::ShuffleChannelGradOpCUDAKernel, - ops::ShuffleChannelGradOpCUDAKernel); + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4e81226188304d1aa63c545d92a24acd46e19d00..f42ebbe0399eb0632e0cbf4d7653f8f180efbd77 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -488,32 +488,24 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CUDA_KERNEL( slice, - ops::SliceKernel, - 
ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel>, - ops::SliceKernel>); + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); REGISTER_OP_CUDA_KERNEL( slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel>, - ops::SliceGradKernel>); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index e5df479090fabe926f65f58e2300e3ee2027e54d..d57b96d0ec5b790708cd62bec592dc0c8f7f8293 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossKernel); -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu index 5a9f7c288d12c571df02828f9ad5a07563f127b4..f9df5a5f74b6fe952c6ed2de7199c8970babab2c 100644 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ b/paddle/fluid/operators/space_to_depth_op.cu @@ -17,16 +17,14 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - space_to_depth, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); -REGISTER_OP_CUDA_KERNEL( - space_to_depth_grad, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 423ec7271087fced758c383a7790e8ce6cf5395f..8bf431e59f017ca9f8c5ef2f426b3ef3bc4779d7 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -209,7 +209,7 @@ input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation */ template -void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxForward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* input, @@ -322,7 +322,7 @@ void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, } template -void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxBackward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* dx, @@ -453,7 +453,7 @@ input: dense A (num_rows,num_cols), dense B 
(num_rows,num_cols) output: sparse C in CSR format (num_rows,num_rows) */ template -void DotSdd(const platform::CUDADeviceContext& ctx, +void DotSdd(const phi::GPUContext& ctx, const Tensor* a, const Tensor* b, const Tensor* c_offset, @@ -546,7 +546,7 @@ input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) output: dense C (num_rows,num_cols) */ template -void DotDsd(const platform::CUDADeviceContext& ctx, +void DotDsd(const phi::GPUContext& ctx, const Tensor* a_offset, const Tensor* a_columns, const Tensor* a_value, @@ -881,10 +881,10 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( sparse_attention, - ops::SparseAttentionCUDAKernel, - ops::SparseAttentionCUDAKernel); + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); REGISTER_OP_CUDA_KERNEL( sparse_attention_grad, - ops::SparseAttentionGradCUDAKernel, - ops::SparseAttentionGradCUDAKernel); + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index f11347269987000786ee170ecf315b4e5b103a51..661fcc83771f54fd3290499584d5896e76ba8c5b 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -13,32 +13,26 @@ #include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c, + ops::FFTC2CKernel, + ops::FFTC2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r, + ops::FFTC2RKernel, + ops::FFTC2RKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c, + ops::FFTR2CKernel, + ops::FFTR2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h index d7911d8ef1891766be7c581031254beb09cb89aa..5ced67691ee0764d38a5457ea261e1f44b48c830 100644 --- a/paddle/fluid/operators/spectral_op.cu.h +++ b/paddle/fluid/operators/spectral_op.cu.h @@ -907,8 +907,8 @@ static bool use_optimized_fft_path(const std::vector& axes) { } template -struct FFTC2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -934,7 +934,7 @@ struct FFTC2CFunctor { std::min(static_cast(kMaxFFTNdim), working_axes.size()); first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - exec_fft( + exec_fft( ctx, p_working_tensor, p_out, first_dims, forward); working_axes.resize(working_axes.size() - max_dims); first_dims.clear(); @@ -945,14 +945,14 @@ struct FFTC2CFunctor { std::swap(p_out, p_working_tensor); } - exec_normalization( + exec_normalization( ctx, p_out, out, normalization, out_dims, axes); } }; template -struct 
FFTC2RFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -965,28 +965,27 @@ struct FFTC2RFunctor { framework::Tensor x_copy(X->type()); x_copy.mutable_data(X->dims(), ctx.GetPlace()); framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft( - ctx, &x_copy, out, axes, forward); + exec_fft(ctx, &x_copy, out, axes, forward); } else { framework::Tensor temp_tensor; temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); const std::vector dims(axes.begin(), axes.end() - 1); - FFTC2CFunctor c2c_functor; + FFTC2CFunctor c2c_functor; c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - exec_fft( + exec_fft( ctx, &temp_tensor, out, {axes.back()}, forward); } - exec_normalization( + exec_normalization( ctx, out, out, normalization, out_dims, axes); } }; // n dimension real to complex FFT use cufft lib template -struct FFTR2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -996,22 +995,21 @@ struct FFTR2CFunctor { framework::Tensor* r2c_out = out; const std::vector last_dim{axes.back()}; std::vector out_dims = phi::vectorize(out->dims()); - exec_fft( - ctx, X, r2c_out, last_dim, forward); + exec_fft(ctx, X, r2c_out, last_dim, forward); // Step2: C2C transform on the remaining dimension framework::Tensor c2c_out; if (axes.size() > 1) { c2c_out.mutable_data(out->dims(), ctx.GetPlace()); std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; + FFTC2CFunctor fft_c2c_func; fft_c2c_func( ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); } const auto in_sizes = phi::vectorize(X->dims()); framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out; - exec_normalization( + exec_normalization( ctx, norm_tensor, out, normalization, in_sizes, axes); } }; diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc index f18efe4a0357ef7e0f305c77a4697064b331ae07..24f4d65f6617e65e7a21ddd6537aaaa2eaf4d655 100644 --- a/paddle/fluid/operators/spp_op.cu.cc +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - spp, - ops::SppKernel, - ops::SppKernel); -REGISTER_OP_CUDA_KERNEL( - spp_grad, - ops::SppGradKernel, - ops::SppGradKernel); +REGISTER_OP_CUDA_KERNEL(spp, + ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL(spp_grad, + ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu index 9cef47bd07ec1145d9742961b856e762568ac6a5..c10cbfb42f1352f2a44c9dd812542acd98ae94bd 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -14,10 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - squared_l2_distance, - ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance, + ops::SquaredL2DistanceKernel); REGISTER_OP_CUDA_KERNEL( squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc index c7a96d03173388970763d80812bbf11d07afa635..a77b369c403732680ca268ff7124ff33f3126abd 100644 --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ b/paddle/fluid/operators/squeeze_op.cu.cc @@ -19,31 +19,27 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>); + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>); REGISTER_OP_CUDA_KERNEL( squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>); diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 7bc3396064cb4af01b5fd66b6545e4b8a5b4c74d..9edee0f66c51428ab2481e132338a24fbed916f8 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -17,12 +17,10 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - stft, - ops::StftKernel, - ops::StftKernel); +REGISTER_OP_CUDA_KERNEL(stft, + ops::StftKernel, + ops::StftKernel); -REGISTER_OP_CUDA_KERNEL( - stft_grad, - ops::StftGradKernel, - ops::StftGradKernel); +REGISTER_OP_CUDA_KERNEL(stft_grad, + ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index d65fc9ea808f6e4f8c85b0beeb4b072fd7c9152b..350c3820a38c244ba48e2a140b160c8b63d10c0d 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -110,8 +110,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(ctx); + auto& cuda_ctx = reinterpret_cast(ctx); memory::Copy(gpu_place, dst + i * dst_after, gpu_place, diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index e16df345427958349f59b431b7f0e8b21924dd4c..3d8902a68acfdbe203fa33850f61e9df0b44ff69 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -86,7 +86,7 @@ TEST(StridedMemcpy, GPUCrop) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + 
phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -128,7 +128,7 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 7b307413cd32070faa0dfe40314b5b4b4a08663b..2cc17de1820eb4a59426a8cfc6adbd1e83147eef 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -77,8 +77,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { const size_t in_num = in_vars.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); @@ -138,11 +137,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { int start = in_place ? 1 : 0; if (!in_place) { - phi::funcs::SetConstant constant_functor; - constant_functor( - context.template device_context(), - out, - static_cast(0)); + phi::funcs::SetConstant constant_functor; + constant_functor(context.template device_context(), + out, + static_cast(0)); } std::vector in_data; @@ -243,8 +241,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } template -class SumKernel - : public framework::OpKernel { +class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto out_var = context.OutputVar("Out"); @@ -252,9 +249,9 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); } else if (out_var->IsType()) { - SelectedRowsCompute(context); + SelectedRowsCompute(context); } else if (out_var->IsType()) { - LodTensorArrayCompute(context); + LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of Output(out) must be Tensor, SelectedRows or " @@ -269,11 +266,10 @@ class SumKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL(sum, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h index f531df936cd90183dda930d30de91f46dd00c3cb..ef8a041fc5adcf74839b8769da09771ff68e0608 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/fluid/operators/tensor_to_string.h @@ -38,7 +38,7 @@ static std::vector ToVector(const T *x, using CopyT = typename std:: conditional::value, uint8_t, T>::type; std::vector cpu_x(n); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); memory::Copy(platform::CPUPlace(), cpu_x.data(), diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 1cd2683796acd2781a714418d63001846cf18715..b13996b6fab78a58ee60bba736db32e5dfa193c2 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -471,8 +471,7 @@ class TensorRTEngineOp : 
public framework::OperatorBase { int runtime_batch = -1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); + auto stream = reinterpret_cast(dev_ctx).stream(); std::vector output_maps = Attr>("output_name_mapping"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 8e2b162babce9c2e3de9167b923ab226623579cd..33ebaff8eabad46b765f9a186257b7cc172a26a3 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -142,7 +142,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -171,7 +171,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index 6df883e83337fd2cd0a0595453617ff7bb395b92..1162bf21592d598549b2db9c3e2b642df36592cc 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -96,10 +96,9 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, #if defined(__NVCC__) || defined(__HIPCC__) if (platform::is_gpu_place(place)) { - auto &cuda_dev_ctx = dynamic_cast(dev_ctx); + auto &cuda_dev_ctx = dynamic_cast(dev_ctx); functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx); - platform::ForRange for_range(cuda_dev_ctx, - limit); + platform::ForRange for_range(cuda_dev_ctx, limit); for_range(actual_functor); } else { #endif diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 390ed2b2ff329ebe1c80cd905f6ea9317cf262a7..4a038c93a1f49654d09273348d583224ffae3aa2 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -902,7 +902,7 @@ __global__ void AssignGradWithAxis(const T* grad_out, } // use the radix sort for the topk template -bool SortTopk(const platform::CUDADeviceContext& ctx, +bool SortTopk(const phi::GPUContext& ctx, const framework::Tensor* input_tensor, const int64_t num_cols, const int64_t num_rows, diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 4910d1cf2594a4a62226cc05554337485d84e993..79236f590f7dc4802b46f3a0a6f81083a61ab7a4 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -157,26 +157,18 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( top_k, - 
paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( top_k_grad, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel); diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu index 17d52cea1e0a02850bd3c2107f4e357ca0a7112f..1e4ca7bb838cef4ce90cda0d3afc067af62a56f9 100644 --- a/paddle/fluid/operators/tree_conv_op.cu +++ b/paddle/fluid/operators/tree_conv_op.cu @@ -15,11 +15,9 @@ #include "paddle/fluid/operators/tree_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - tree_conv, - ops::TreeConvKernel, - ops::TreeConvKernel); -REGISTER_OP_CUDA_KERNEL( - tree_conv_grad, - ops::TreeConvGradKernel, - ops::TreeConvGradKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv, + ops::TreeConvKernel, + ops::TreeConvKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv_grad, + ops::TreeConvGradKernel, + ops::TreeConvGradKernel); diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index febb093ed70afda7d8128b1af9bceffd0cae2742..a34909374102118027ad1205256cee558b0a74ee 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -32,12 +32,11 @@ class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); auto dims = vectorize(dx->dims()); - const auto& dev_cxt = - ctx.template device_context(); + const auto& dev_cxt = ctx.template device_context(); float value = static_cast(0.0f); phi::FullKernel( static_cast::TYPE&>(dev_cxt), + phi::GPUContext>::TYPE&>(dev_cxt), dims, value, phi::DataType::UNDEFINED, diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 0614e0920dfac8ac3f9696319d2b8ae9f724996b..9f0f93f5573f5e132094dee236ea90d420e130df 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -150,8 +150,7 @@ template void UniformRandom(const framework::ExecutionContext& context, framework::Tensor* tensor) { int64_t size = tensor->numel(); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); T* data = tensor->mutable_data(dev_cxt.GetPlace()); if (size <= 0) return; unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc index 71c056580331b4dbc8a1ba39f77d21a12efcf751..82890419dafa5012334dc21e35dd1eb037d2a7a5 100644 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -15,19 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CUDA_KERNEL( - unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d, + ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc index 598595ff28bf3de2773a69de5b6c78b21f704218..3a98a64d858a5de0cfac14385eb82e6e944226f0 100644 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -19,35 +19,30 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>); diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index cd6c3a22e0359f0ca7c200c93ab281fb8944739c..3ec89214a3865c607acd30732ec635cf82a41fa7 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -67,7 +67,7 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); - CUDADeviceContext gpu_ctx(gpu_place); + phi::GPUContext gpu_ctx(gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 00b5dd7f8afe90e6549e829fc662b9fe2edc70b7..2589aa9acd0d20ed18f390574b654ae3678a6c6a 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -41,10 +41,10 @@ class NCCLCommImpl : public NCCLComm { gpuStream_t stream() const override { return dev_ctx_->stream(); } - void set_dev_ctx(std::unique_ptr&& dev_ctx) { + void set_dev_ctx(std::unique_ptr&& dev_ctx) { 
dev_ctx_ = std::move(dev_ctx); } - CUDADeviceContext* dev_context() const override { return dev_ctx_.get(); } + phi::GPUContext* dev_context() const override { return dev_ctx_.get(); } gpuEvent_t compute_event() const override { return compute_event_.get(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int nranks_; int rank_; ncclComm_t comm_; - std::unique_ptr dev_ctx_; + std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream std::shared_ptr compute_event_; @@ -203,8 +203,8 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( NCCLComm* NCCLCommContext::AssignNCCLComm( ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { - std::unique_ptr dev_ctx( - new CUDADeviceContext(CUDAPlace(dev_id))); + std::unique_ptr dev_ctx( + new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) .get()); @@ -246,7 +246,7 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( comm_map_mutex_.unlock(); if (ring_id == 0) { - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(dev_id))); dev_ctx->set_nccl_comm(comm); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 37065960828d63b53d2dc637828f18e4833107a7..207496d9f46d823559c1db8389cfa392a4d1579a 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -62,7 +62,7 @@ class NCCLComm { virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; - virtual CUDADeviceContext* dev_context() const = 0; + virtual phi::GPUContext* dev_context() const = 0; virtual ~NCCLComm() = default; }; diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index bfdf492962de3ec0c2469f1c65fa9fd4d182376a..9f049b6e248f72152566f7afcf00dc5d7fc0766b 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -27,8 +27,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode, int64_t pool_id) { auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); // After PR(#43206), cudnn related initializations will change to lazy mode. 
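Note: the hunks in this series all apply the same mechanical substitution. `platform::CUDADeviceContext` was a type alias for `phi::GPUContext` (the alias itself is removed in the `device_context.h` hunk below), so call sites now name the phi type directly and behaviour is unchanged. A minimal sketch of the two call-site forms touched throughout, built around a hypothetical `ExampleKernel` (the kernel name, the "Out" tensor name, and the exact include set are illustrative assumptions, not part of this patch):

// Sketch only. Every paddle/phi identifier used below also appears in the
// hunks of this patch; the surrounding kernel itself is hypothetical.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace paddle {
namespace operators {

template <typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // Form 1: fetch the GPU context from the ExecutionContext; previously
    // spelled device_context<platform::CUDADeviceContext>().
    auto& dev_ctx = context.template device_context<phi::GPUContext>();
    auto stream = dev_ctx.stream();  // stream accessor is unchanged
    (void)stream;

    // Form 2: fetch it from the global pool and cast; previously a
    // static_cast/reinterpret_cast to platform::CUDADeviceContext*.
    auto* pool_ctx = static_cast<phi::GPUContext*>(
        platform::DeviceContextPool::Instance().Get(context.GetPlace()));
    (void)pool_ctx;

    // Helpers templated on the device context take phi::GPUContext as well.
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());
    phi::funcs::SetConstant<phi::GPUContext, T> set_zero;
    set_zero(dev_ctx, out, static_cast<T>(0));
  }
};

}  // namespace operators
}  // namespace paddle

Standalone contexts (the *_test.cu and collective-helper hunks) follow the construction shown verbatim in those hunks: build `phi::GPUContext ctx(place)` and wire an allocator via `ctx.SetAllocator(memory::allocation::AllocatorFacade::Instance().GetAllocator(place, ctx.stream()).get())` before first use.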
@@ -66,8 +65,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, std::unique_ptr EndCUDAGraphCapture() { auto place = CUDAGraph::CapturingPlace(); auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); dev_ctx->SetCUDAGraphAllocator(nullptr); return CUDAGraph::EndCapture(); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index cc76a04a769a76ce7efd4643a7de38b567cf4950..427901c1a7fd5570ab4fe75df0ea9c280bb4e843 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -621,7 +621,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 507baf6c0f4de5d40de552dd8f0fc310c7cc948d..3628b7e0418fa192db3eff322d268c51c91e3862 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -65,8 +65,8 @@ static inline int RoundToPowerOfTwo(int n) { #ifdef WITH_NV_JETSON // The number of threads cannot be assigned 1024 in some cases when the device // is nano or tx2 . -template -inline void ChangeThreadNum(const CUDADeviceContext& context, +template +inline void ChangeThreadNum(const phi::GPUContext& context, int* num_thread, int alternative_num_thread = 512) { if (context.GetComputeCapability() == 53 || @@ -99,10 +99,9 @@ struct GpuLaunchConfig { * cuda performs better. And number of blocks should be greater (at least * 2x~4x) than number of SMs. Hence, SM count is took into account within * this function to determine the right number of threads per block. */ -inline GpuLaunchConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, - int64_t numel, - int vec_size = 1) { +inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, + int64_t numel, + int vec_size = 1) { PADDLE_ENFORCE_GE(numel, 0, platform::errors::InvalidArgument( @@ -146,8 +145,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( return config; } -inline GpuLaunchConfig GetGpuLaunchConfig2D( - const platform::CUDADeviceContext& context, int x_dim, int y_dim) { +inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, + int x_dim, + int y_dim) { PADDLE_ENFORCE_GT( x_dim, 0, @@ -182,8 +182,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( template void LimitGridDim(const Context& ctx, dim3* grid_dim) { - auto max_grid_dim = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize(); + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; grid_dim->z = grid_dim->z < max_grid_dim[2] ? 
                    grid_dim->z : max_grid_dim[2];
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index 1ce8038f0e3e2b6110d37a83fbb3d7bd1f0e191d..a5d89f6001fa18359aa60c661ace2e9526ae2d6f 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -120,11 +120,11 @@ class NCCLGroupGuard {
 };
 
 struct NCCLContext {
-  std::unique_ptr<CUDADeviceContext> ctx_;
+  std::unique_ptr<phi::GPUContext> ctx_;
   ncclComm_t comm_;
 
   explicit NCCLContext(int dev_id) : comm_{nullptr} {
-    ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id)));
+    ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id)));
     ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(CUDAPlace(dev_id), ctx_->stream())
                            .get());
@@ -211,11 +211,9 @@ struct NCCLContextMap {
   NCCLContextMap(const NCCLContextMap &other) = delete;
   NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
 
-  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
+  phi::GPUContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
 
-  CUDADeviceContext *DevCtx(platform::Place p) const {
-    return DevCtx(p.device);
-  }
+  phi::GPUContext *DevCtx(platform::Place p) const { return DevCtx(p.device); }
 
   const NCCLContext &at(platform::Place p) const { return this->at(p.device); }
 
diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
index ff7f64ef1bec13f4734161c5bdbb5c22ab4c2439..9cb5cdfbb164d49eb5ceb1d67ea9b88ed1d3a94c 100644
--- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
+++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h
@@ -558,7 +558,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
   use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_HIP
   if (use_cudnn) {
-    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
+    auto& dev_ctx = ctx.device_context<phi::GPUContext>();
     use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
   }
 #endif
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index bd8d3f8a37230900bebbc0fc4136fef9131b6f1a..f91b420be0d8a9e303aa3eacd76565a6a4dca1ff 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -252,7 +252,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
   }
 
   // Compile the program for specified compute_capability
-  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       DeviceContextPool::Instance().Get(place_));
   int compute_capability = dev_ctx->GetComputeCapability();
   std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"};
@@ -329,7 +329,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
   }
 
   // Compile the program for specified compute_capability
-  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
      DeviceContextPool::Instance().Get(place_));
   int compute_capability = dev_ctx->GetComputeCapability();
   std::string compute_flag =
@@ -416,7 +416,7 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const {
       max_blocks,
       (static_cast(n) + workload_per_block - 1) / workload_per_block);
 
-  auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       DeviceContextPool::Instance().Get(place_));
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_EQ(
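For context on the call-site pattern the hunks above converge on: wherever a file used to pull the pooled default context out as a `paddle::platform::CUDADeviceContext*`, it now casts straight to `phi::GPUContext*`. The snippet below is an illustrative sketch only, not part of the patch; the helper name `QueryDefaultGpuContext` is made up, while `DeviceContextPool`, `GetComputeCapability()` and `stream()` are the calls already visible in this diff.

```cpp
#include "paddle/fluid/platform/device_context.h"

// Hypothetical helper mirroring the pattern used in device_code.cc above.
void QueryDefaultGpuContext(const paddle::platform::CUDAPlace& place) {
  auto* dev_ctx = static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
  // phi::GPUContext exposes the same queries the old alias did.
  int compute_capability = dev_ctx->GetComputeCapability();
  auto stream = dev_ctx->stream();
  (void)compute_capability;
  (void)stream;
}
```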
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 113862c6ec2d493ee0e29207793ecf2f566606e6..d38118d2a260a21745565f02a31acda5af334772 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -50,17 +50,16 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_dev_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(place));
-    auto& desired_dev_ctx =
-        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+    auto& desired_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
     if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
       return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
                                    size,
                                    phi::Stream(reinterpret_cast(
                                        desired_dev_ctx.stream())));
     } else {
-      return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
+      return allocation::GPUContextAllocatorPool::Instance().Alloc(
           desired_dev_ctx, size);
     }
 #else
@@ -191,11 +190,11 @@ std::unique_ptr CreateDeviceContext(
   auto* dev_ctx = new DevCtx(p);
   if (is_gpu_place(p)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    auto* cuda_ctx = dynamic_cast<CUDADeviceContext*>(dev_ctx);
+    auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
     PADDLE_ENFORCE_NOT_NULL(
         cuda_ctx,
         platform::errors::InvalidArgument(
-            "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
+            "Failed to dynamic_cast dev_ctx into phi::GPUContext."));
 
     auto& instance = memory::allocation::AllocatorFacade::Instance();
     if (!disable_setting_default_stream_for_allocator) {
@@ -271,7 +270,7 @@ void EmplaceDeviceContexts(
 #endif
     } else if (platform::is_gpu_place(p)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      EmplaceDeviceContext<CUDADeviceContext>(
+      EmplaceDeviceContext<phi::GPUContext>(
           place_to_device_context,
           p,
           disable_setting_default_stream_for_allocator);
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 70b979aa9bb1024e688e85974ff591c5fdabbd53..6d08a0cc32b632aea7dac48a08b8093b4a1507ba 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -271,11 +271,9 @@ struct DefaultDeviceContextType {
 class CudnnWorkspaceHandle;
 class EigenCudaStreamDevice;
 
-using CUDADeviceContext = phi::GPUContext;
-
 class CudnnWorkspaceHandle {
  public:
-  inline CudnnWorkspaceHandle(const CUDADeviceContext& dev_ctx, std::mutex* mtx)
+  inline CudnnWorkspaceHandle(const phi::GPUContext& dev_ctx, std::mutex* mtx)
       : device_context_(dev_ctx), mtx_(mtx) {}
 
   template
@@ -318,13 +316,13 @@ class CudnnWorkspaceHandle {
  private:
   memory::allocation::AllocationPtr allocation_;
-  const CUDADeviceContext& device_context_;
+  const phi::GPUContext& device_context_;
   std::mutex* mtx_;
 };
 
 template <>
 struct DefaultDeviceContextType {
-  using TYPE = CUDADeviceContext;
+  using TYPE = phi::GPUContext;
 };
 
 // Currently, CUDAPinnedDeviceContext is only used to data copying.
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 2db29dc11ada05b8392bc8bcaacfa6562734f333..abffa1e8846df40d2b9ef0fc0a5d6d2a781c87e6 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -19,13 +19,13 @@ limitations under the License.
 */
 
 #include "paddle/fluid/platform/device_context.h"
 
 TEST(Device, Init) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
+  using phi::GPUContext;
 
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -50,13 +50,13 @@ TEST(Device, Init) {
   }
 }
 
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
+TEST(Device, GPUContext) {
   using paddle::platform::CUDAPlace;
+  using phi::GPUContext;
 
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -94,10 +94,10 @@ TEST(Device, CUDADeviceContext) {
 
 TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
 
   DeviceContextPool& pool = DeviceContextPool::Instance();
   auto cpu_dev_ctx1 = pool.Get(CPUPlace());
diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu
index efb0d9ed7568960569cc2a1359d620880d6b740b..14967edbe4edf619968fff04dd70819ffb6fa0cb 100644
--- a/paddle/fluid/platform/device_context_test_cuda_graph.cu
+++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu
@@ -20,11 +20,11 @@ limitations under the License.
 */
 
 #include "paddle/fluid/platform/device_context.h"
 
 TEST(Device, DeviceContextWithCUDAGraph) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
 
   DeviceContextPool& pool = DeviceContextPool::Instance();
   Place place = CUDAPlace(0);
diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc
index c9d9b6915b15924799d6198197637704019e5c64..37da8daf7fd69b02cee2ec04ebfcfaf602713e73 100644
--- a/paddle/fluid/platform/device_event_gpu.cc
+++ b/paddle/fluid/platform/device_event_gpu.cc
@@ -49,12 +49,11 @@ void DeviceEventCreateCUDA(DeviceEvent* event,
 
 void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
 
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
 
   wrapper->inner_event_.Record(cuda_dev_ctx->stream());
 }
 
@@ -78,12 +77,11 @@ void DeviceEventFinishCUDA(const DeviceEvent* event) {
 void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
                              const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
   // calling cudaStreamWaitEvent(stream, event, 0)
   cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
 }
diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc
index 9fb423e782d10167a36cd61e66178a528777af78..7dfacc66437ae0a19ff9bafa7ed23418e151b89a 100644
--- a/paddle/fluid/platform/device_event_test.cc
+++ b/paddle/fluid/platform/device_event_test.cc
@@ -33,8 +33,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
 
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
 
   // case 1. test for event_creator
@@ -83,8 +82,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
 
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
 
   // case 1. test for event_creator
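The test updates above and below all follow the same construction recipe: a freshly built `phi::GPUContext` is not usable until its allocator has been wired up. Below is a minimal sketch of that recipe, assuming the `AllocatorFacade` API shown in these hunks; the factory name `MakeGpuContext` and the allocator_facade.h include path are assumptions, not taken from the patch.

```cpp
#include <memory>

#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/fluid/platform/device_context.h"

// Hypothetical factory mirroring the test-code pattern in this diff.
std::unique_ptr<phi::GPUContext> MakeGpuContext(int dev_id) {
  paddle::platform::CUDAPlace place(dev_id);
  auto ctx = std::make_unique<phi::GPUContext>(place);
  // The tests shown here set only the device allocator before first use.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  return ctx;
}
```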
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index d91cb6da2dcbbfc97e37a03c2393718e2dcc0aea..d6edb9ba9478c27232d218bfb50cc35e4bafd76a 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -328,7 +328,7 @@ TEST(float16, lod_tensor_on_gpu) {
 
   // CPU LoDTensor to GPU LoDTensor
   CUDAPlace gpu_place(0);
-  CUDADeviceContext gpu_ctx(gpu_place);
+  phi::GPUContext gpu_ctx(gpu_place);
   gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, gpu_ctx.stream())
                            .get());
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 5e0717ba635cedb6878beec347668599846afe5d..ce68452ffbe32df3ae449a949bf945a907f121ea 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -40,9 +40,9 @@
 using paddle::memory::Alloc;
 using paddle::memory::Copy;
 
 using paddle::platform::CPUPlace;
-using paddle::platform::CUDADeviceContext;
 using paddle::platform::CUDAPlace;
 
 using phi::CPUContext;
+using phi::GPUContext;
 
 using paddle::platform::Transform;
 
@@ -58,7 +58,7 @@ TEST(Transform, CPUUnary) {
 
 TEST(Transform, GPUUnary) {
   CUDAPlace gpu0(0);
-  CUDADeviceContext ctx(gpu0);
+  phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(gpu0, ctx.stream())
                        .get());
@@ -67,7 +67,7 @@ TEST(Transform, GPUUnary) {
   auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
   float* gpu_buf = static_cast(gpu_allocation->ptr());
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
-  Transform<CUDADeviceContext> trans;
+  Transform<phi::GPUContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
@@ -89,7 +89,7 @@ TEST(Transform, CPUBinary) {
 
 TEST(Transform, GPUBinary) {
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
-  CUDADeviceContext ctx(gpu0);
+  phi::GPUContext ctx(gpu0);
   ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(gpu0, ctx.stream())
                        .get());
@@ -97,7 +97,7 @@ TEST(Transform, GPUBinary) {
   auto gpu_allocation = Alloc(gpu0, sizeof(buf));
   int* gpu_buf = static_cast(gpu_allocation->ptr());
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
-  Transform<CUDADeviceContext> trans;
+  Transform<phi::GPUContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index df5b2c27122d9aff20844bb58417b6d7e8703985..f93e9b6de922132df5ea2ebd6ee6b84e460cae45 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1257,7 +1257,7 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - auto* context = new paddle::platform::CUDADeviceContext(place); + auto* context = new phi::GPUContext(place); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, context->stream()) diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 5d06dddd9645b09f93b9cc91d5c23ec80b17ba99..9b5c24abc677d7181e86195b8e038cd8ae3e8927 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -243,9 +243,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context, grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } - auto stream = - reinterpret_cast(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmForward(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmBackward -struct GRUUnitFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -93,8 +93,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -184,8 +183,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -236,8 +235,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -333,10 +331,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cu b/paddle/phi/kernels/funcs/lstm_compute.cu index b2057cfc4f911c47039eee5f559d474519fac8f6..e3e8b6cc12407b9d73c173c42c7c356888d38ed0 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cu +++ b/paddle/phi/kernels/funcs/lstm_compute.cu @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 */
 
+#include "paddle/phi/kernels/funcs/lstm_compute.h"
+
 #include "paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h"
 #include "paddle/phi/kernels/funcs/detail/lstm_kernel.h"
-#include "paddle/phi/kernels/funcs/lstm_compute.h"
 
 namespace phi {
 namespace funcs {
 
 template
-struct LstmUnitFunctor<paddle::platform::CUDADeviceContext, T> {
-  static void compute(const paddle::platform::CUDADeviceContext& context,
+struct LstmUnitFunctor<phi::GPUContext, T> {
+  static void compute(const phi::GPUContext& context,
                       LstmMetaValue value,
                       int frame_size,
                       int batch_size,
@@ -43,8 +43,8 @@ struct LstmUnitFunctor {
 };
 
 template
-struct LstmUnitGradFunctor<paddle::platform::CUDADeviceContext, T> {
-  static void compute(const paddle::platform::CUDADeviceContext& context,
+struct LstmUnitGradFunctor<phi::GPUContext, T> {
+  static void compute(const phi::GPUContext& context,
                       LstmMetaValue value,
                       LstmMetaGrad grad,
                       int frame_size,
@@ -67,10 +67,10 @@ struct LstmUnitGradFunctor {
   }
 };
 
-template class LstmUnitFunctor;
-template class LstmUnitFunctor;
-template class LstmUnitGradFunctor;
-template class LstmUnitGradFunctor;
+template class LstmUnitFunctor;
+template class LstmUnitFunctor;
+template class LstmUnitGradFunctor;
+template class LstmUnitGradFunctor;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu
index bbd160e35c7f43254f75a486bfdaa324d73af3f8..9f0c20ccf14dc0e9cef169e75b352c0d1eabda39 100644
--- a/paddle/phi/kernels/funcs/math_function.cu
+++ b/paddle/phi/kernels/funcs/math_function.cu
@@ -222,11 +222,10 @@ struct TensorSetConstantGPU {
   template
   void apply() const {
-    SetConstant<paddle::platform::CUDADeviceContext, T> functor;
-    functor(
-        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(context_),
-        tensor_,
-        static_cast(value_));
+    SetConstant<phi::GPUContext, T> functor;
+    functor(reinterpret_cast<const phi::GPUContext&>(context_),
+            tensor_,
+            static_cast(value_));
   }
 
   const paddle::platform::DeviceContext& context_;
@@ -255,8 +254,8 @@ __global__ void RowwiseAddKernel(
 }
 
 template
-struct RowwiseAdd<paddle::platform::CUDADeviceContext, T> {
-  void operator()(const paddle::platform::CUDADeviceContext& context,
+struct RowwiseAdd<phi::GPUContext, T> {
+  void operator()(const phi::GPUContext& context,
                   const paddle::framework::Tensor& input,
                   const paddle::framework::Tensor& vector,
                   paddle::framework::Tensor* output) {
@@ -294,18 +293,18 @@ struct RowwiseAdd {
   }
 };
 
-template struct RowwiseAdd;
-template struct RowwiseAdd;
-template struct ColwiseSum;
-template struct ColwiseSum;
-template struct ColwiseSum;
-// template struct ColwiseSum;
-// The ColwiseSum failed in debug
+template struct RowwiseAdd;
+template struct RowwiseAdd;
+template struct ColwiseSum;
+template struct ColwiseSum;
+template struct ColwiseSum;
+// template struct ColwiseSum;
+// The ColwiseSum failed in debug
 // mode,
 // and only failed for this case. So reimplemented it.
 template <>
-void ColwiseSum::operator()(
-    const paddle::platform::CUDADeviceContext& context,
+void ColwiseSum::operator()(
+    const phi::GPUContext& context,
     const paddle::framework::Tensor& input,
     paddle::framework::Tensor* vector) {
   auto in_dims = input.dims();
@@ -320,28 +319,28 @@ void ColwiseSum::operator()(
                         vector->numel()));
   paddle::framework::Tensor one;
   one.mutable_data({in_dims[0]}, context.GetPlace());
-  SetConstant set;
+  SetConstant set;
   set(context, &one, static_cast(1.0));
-  phi::funcs::GetBlas(context)
-      .GEMV(true,
-            static_cast(in_dims[0]),
-            static_cast(in_dims[1]),
-            1.0,
-            input.data(),
-            one.data(),
-            0.0,
-            vector->data());
+  phi::funcs::GetBlas(context).GEMV(
+      true,
+      static_cast(in_dims[0]),
+      static_cast(in_dims[1]),
+      1.0,
+      input.data(),
+      one.data(),
+      0.0,
+      vector->data());
 }
 
-template struct RowwiseSum;
-// template struct RowwiseSum;
 // TODO(zcd): Following ColwiseSum format, need to confirm.
-// The RowwiseSum failed in debug
+template struct RowwiseSum;
+// template struct RowwiseSum;
+// The RowwiseSum failed in debug
 // mode,
 // and only failed for this case. So reimplemented it.
 template <>
-void RowwiseSum::operator()(
-    const paddle::platform::CUDADeviceContext& context,
+void RowwiseSum::operator()(
+    const phi::GPUContext& context,
     const paddle::framework::Tensor& input,
     paddle::framework::Tensor* vector) {
   auto in_dims = input.dims();
@@ -356,25 +355,25 @@ void RowwiseSum::operator()(
                         vector->numel()));
   paddle::framework::Tensor one;
   one.mutable_data({size}, context.GetPlace());
-  SetConstant set;
+  SetConstant set;
   set(context, &one, static_cast(1.0));
-  phi::funcs::GetBlas(context)
-      .GEMV(true,
-            static_cast(in_dims[1]),
-            static_cast(in_dims[0]),
-            1.0,
-            one.data(),
-            input.data(),
-            0.0,
-            vector->data());
+  phi::funcs::GetBlas(context).GEMV(
+      true,
+      static_cast(in_dims[1]),
+      static_cast(in_dims[0]),
+      1.0,
+      one.data(),
+      input.data(),
+      0.0,
+      vector->data());
 }
 
-template struct RowwiseMean;
-template struct RowwiseMean;
+template struct RowwiseMean;
+template struct RowwiseMean;
 
 template
-struct ElementwiseAddTo<paddle::platform::CUDADeviceContext, T> {
-  void operator()(paddle::platform::CUDADeviceContext* ctx,
+struct ElementwiseAddTo<phi::GPUContext, T> {
+  void operator()(phi::GPUContext* ctx,
                   const paddle::framework::Tensor& src,
                   paddle::framework::Tensor* dst) {
     auto in = paddle::framework::EigenVector::Flatten(src);
@@ -384,10 +383,8 @@ struct ElementwiseAddTo {
   }
 };
 
-template struct ElementwiseAddTo;
-template struct ElementwiseAddTo;
+template struct ElementwiseAddTo;
+template struct ElementwiseAddTo;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/funcs/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu
index a66030e642628e8afaea7766ec96b7a23faf429b..196ca7a2ef96ea31489d2d66cc87dd98b28b1483 100644
--- a/paddle/phi/kernels/funcs/sequence2batch.cu
+++ b/paddle/phi/kernels/funcs/sequence2batch.cu
@@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src,
 }
 
 template
-class CopyMatrixRowsFunctor<paddle::platform::CUDADeviceContext, T> {
+class CopyMatrixRowsFunctor<phi::GPUContext, T> {
  public:
-  void operator()(const paddle::platform::CUDADeviceContext& context,
+  void operator()(const phi::GPUContext& context,
                   const paddle::framework::Tensor& src,
                   paddle::framework::Vector index_lod,
                   paddle::framework::Tensor* dst,
@@ -90,19 +90,13 @@ class CopyMatrixRowsFunctor {
   }
 };
 
-template class CopyMatrixRowsFunctor;
-template class CopyMatrixRowsFunctor;
+template class CopyMatrixRowsFunctor;
+template class CopyMatrixRowsFunctor;
 
-template class LoDTensor2BatchFunctor;
-template class LoDTensor2BatchFunctor;
-template class Batch2LoDTensorFunctor;
-template class Batch2LoDTensorFunctor;
+template class LoDTensor2BatchFunctor;
+template class LoDTensor2BatchFunctor;
+template class Batch2LoDTensorFunctor;
+template class Batch2LoDTensorFunctor;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu
index e0b7bba50d6e9bc3200b69e176f58062289dbd37..657430e1e758d9d53a33f74c5f05a2f87144c392 100644
--- a/paddle/phi/kernels/gpu/top_k_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_k_kernel.cu
@@ -79,8 +79,7 @@ void TopkKernel(const Context& dev_ctx,
     // The conclusion is drawn from the data through multiple sets of
     // statistics
     if (input_width >= 128 && k >= input_width * 0.75) {
-      auto* ctx = reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
-          &dev_ctx);
+      auto* ctx = reinterpret_cast<const phi::GPUContext*>(&dev_ctx);
       if (ops::SortTopk(*ctx,
                         input,
                         input_width,
@@ -131,9 +130,7 @@ void TopkKernel(const Context& dev_ctx,
     dev_ctx.template Alloc(&sorted_output);
     dev_ctx.template Alloc(&sorted_indices);
     dev_ctx.template Alloc(&gather_indices);
-    auto* ctx =
-        reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
-            &dev_ctx);
+    auto* ctx = reinterpret_cast<const phi::GPUContext*>(&dev_ctx);
     if (ops::SortTopk(*ctx,
                       out,
                       k,
@@ -239,8 +236,7 @@ void TopkKernel(const Context& dev_ctx,
     // The conclusion is drawn from the data through multiple sets of
     // statistics
     if (input_width >= 128 && k >= input_width * 0.75) {
-      auto* ctx = reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
-          &dev_ctx);
+      auto* ctx = reinterpret_cast<const phi::GPUContext*>(&dev_ctx);
       if (ops::SortTopk(*ctx,
                         &trans_input,
                         input_width,
diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu
index 853187fc802a07574c10e615728ceacb8617bbb9..479d874626a4e02ad31222798eca2958539e06dd 100644
--- a/paddle/phi/tests/kernels/test_math_function.cu
+++ b/paddle/phi/tests/kernels/test_math_function.cu
@@ -37,9 +37,9 @@ void fill_fp16_data(phi::dtype::float16* in_ptr,
 }
 
 template
-inline phi::funcs::BlasT GetBlas(
-    const paddle::platform::CUDADeviceContext& context) {
-  return phi::funcs::GetBlas(context);
+inline phi::funcs::BlasT GetBlas(
+    const phi::GPUContext& context) {
+  return phi::funcs::GetBlas(context);
 }
 
 TEST(math_function, notrans_mul_trans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -87,7 +87,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -134,7 +134,7 @@ TEST(math_function, trans_mul_notrans_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -176,7 +176,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -229,7 +229,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -287,7 +287,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -364,7 +364,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -416,7 +416,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
@@ -485,7 +485,7 @@ void GemvTest(int m, int n, bool trans) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CUDAPlace gpu_place(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h
index ffe89fde0470e95ad3b376cf4619e4598ee6d183..70919708e19ddda2fe799a0669209d3523b83ec5 100644
--- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h
+++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h
@@ -62,7 +62,7 @@ struct ReluFunctor {
 #if defined(__NVCC__) || defined(__HIPCC__)
     if (paddle::platform::is_gpu_place(place)) {
-      LAUNCH_RELU_KERNEL(paddle::platform::CUDADeviceContext);
+      LAUNCH_RELU_KERNEL(phi::GPUContext);
       return;
     }
 #endif
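Taken together, downstream code written against the old alias keeps compiling by substituting `phi::GPUContext` for `paddle::platform::CUDADeviceContext` at its template arguments and cast sites, as the custom-op hunk above does with `LAUNCH_RELU_KERNEL(phi::GPUContext)`. Below is a hedged sketch of that substitution for a user-side functor; the functor and function names are illustrative, not from the patch.

```cpp
#include "paddle/fluid/platform/device_context.h"

// Illustrative functor templated on the device context, as the phi functors
// in this patch are; only the context type named at the call site changes.
template <typename DeviceContext>
struct StreamSyncFunctor {
  void operator()(const DeviceContext& ctx) const {
    ctx.Wait();  // same DeviceContext interface as before the rename
  }
};

void SyncDefaultGpu(const phi::GPUContext& ctx) {
  StreamSyncFunctor<phi::GPUContext> sync;  // was CUDADeviceContext before
  sync(ctx);
}
```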