From 8676302364924bb190dcf171da7cf30d290aa2a6 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Mon, 1 Aug 2022 19:40:30 +0800
Subject: [PATCH] unify gpu context (#44740)

* remove cudaDeviceContext
* remove more template
* fix rocm compile
* remove alias name CUDADeviceContext
* fix compile
* fix tests
* revert changes
---
 .../fluid/distributed/collective/HCCLTools.h | 2 +-
 .../fluid/distributed/collective/NCCLTools.h | 8 +-
 .../collective/ProcessGroupNCCL.cc | 12 +-
 .../distributed/collective/ProcessGroupNCCL.h | 4 +-
 .../fluid/distributed/collective/reducer.cc | 6 +-
 .../distributed/fleet_executor/dist_model.cc | 3 +-
 .../distributed/ps/service/brpc_utils.cc | 12 +-
 .../distributed/ps/service/heter_client.cc | 3 +-
 .../eager_generated/backwards/scale_node.cc | 17 +-
 paddle/fluid/eager/nan_inf_utils.cc | 3 +-
 .../performance_tests/benchmark_fluid_cuda.cc | 9 +-
 .../performance_tests/benchmark_utils.cc | 6 +-
 paddle/fluid/eager/tests/test_utils.h | 8 +-
 .../framework/data_device_transform_test.cu | 5 +-
 paddle/fluid/framework/data_feed.cc | 4 +-
 paddle/fluid/framework/data_feed.cu | 4 +-
 paddle/fluid/framework/data_type_transform.cc | 4 +-
 .../framework/data_type_transform_test.cu | 2 +-
 .../details/broadcast_op_handle_test.h | 2 +-
 .../details/eager_deletion_op_handle.cc | 2 +-
 .../details/eager_deletion_op_handle.h | 2 +-
 .../details/gather_op_handle_test.cc | 2 +-
 .../framework/details/nan_inf_utils_detail.cc | 3 +-
 .../framework/details/nan_inf_utils_detail.cu | 14 +-
 .../fluid/framework/details/op_handle_base.cc | 17 +-
 .../details/reduce_op_handle_test.cc | 2 +-
 .../details/scale_loss_grad_op_handle.cc | 2 +-
 paddle/fluid/framework/fleet/box_wrapper.cu | 6 +-
 paddle/fluid/framework/fleet/box_wrapper.h | 15 +-
 .../framework/fleet/heter_ps/feature_value.cu | 4 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cu | 2 +-
 paddle/fluid/framework/garbage_collector.cc | 6 +-
 .../fluid/framework/heter_section_worker.cc | 3 +-
 paddle/fluid/framework/heterxpu_trainer.cc | 2 +-
 .../ir/fusion_group/code_generator_tester.cc | 2 +-
 paddle/fluid/framework/mixed_vector.cc | 4 +-
 paddle/fluid/framework/mixed_vector_test.cu | 2 +-
 .../framework/new_executor/interpretercore.cc | 5 +-
 paddle/fluid/framework/op_registry_test.cc | 14 +-
 paddle/fluid/framework/operator.h | 5 +-
 paddle/fluid/framework/parallel_executor.cc | 8 +-
 paddle/fluid/framework/phi_utils.h | 2 +-
 paddle/fluid/framework/tensor_util.cc | 31 ++--
 paddle/fluid/framework/tensor_util.h | 65 ++++----
 paddle/fluid/framework/tensor_util_test.cc | 36 ++--
 paddle/fluid/imperative/all_reduce.cc | 4 +-
 .../fluid/imperative/gradient_accumulator.cc | 23 ++-
 paddle/fluid/imperative/nccl_context.cc | 10 +-
 paddle/fluid/imperative/reducer.cc | 18 +-
 paddle/fluid/imperative/reducer.cu | 9 +-
 .../tests/heter_ccl_context_test.cc | 2 +-
 .../imperative/tests/nccl_context_test.cc | 2 +-
 .../fluid/inference/api/analysis_predictor.cc | 11 +-
 paddle/fluid/inference/api/api_impl.cc | 3 +-
 .../inference/api/paddle_infer_contrib.cc | 3 +-
 paddle/fluid/inference/lite/tensor_utils.cc | 13 +-
 .../fluid/inference/lite/test_engine_lite.cc | 2 +-
 .../fluid/inference/lite/test_tensor_utils.cc | 3 +-
 .../tensorrt/convert/test_io_converter.cc | 2 +-
 .../inference/tensorrt/convert/ut_helper.h | 4 +-
 .../tensorrt/plugin/qkv_to_context_plugin.cu | 8 +-
 .../inference/tensorrt/test_dynamic_engine.cc | 8 +-
 .../fluid/inference/tensorrt/test_engine.cc | 4 +-
 .../allocation/best_fit_allocator_test.cu | 5 +-
 .../cuda_device_context_allocator.h | 64 ++++----
 paddle/fluid/memory/malloc_test.cu
| 14 +- .../memory/stream_safe_cuda_alloc_test.cu | 4 +- .../fluid/operators/activation_cudnn_op.cu.cc | 34 ++-- paddle/fluid/operators/activation_op.kps | 119 ++++++-------- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../operators/affine_grid_cudnn_op.cu.cc | 4 +- paddle/fluid/operators/affine_grid_op.cu | 4 +- .../amp/check_finite_and_unscale_op.cu | 5 +- .../operators/amp/update_loss_scaling_op.cu | 12 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/assign_pos_op.cu | 3 +- paddle/fluid/operators/batch_fc_op.cu | 18 +- paddle/fluid/operators/beam_search_op.cu.cc | 11 +- paddle/fluid/operators/cast_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 2 +- .../cinn/cinn_instruction_run_op.cu.cc | 3 +- .../fluid/operators/cinn/cinn_launch_op.cu.cc | 4 +- paddle/fluid/operators/cinn/cinn_op_helper.cc | 6 +- paddle/fluid/operators/cinn/cinn_op_helper.h | 3 +- .../fluid/operators/class_center_sample_op.cu | 7 +- paddle/fluid/operators/coalesce_tensor_op.cc | 9 +- .../operators/collective/allreduce_op.cu.cc | 13 +- .../fluid/operators/collective/allreduce_op.h | 2 +- .../operators/collective/alltoall_op.cu.cc | 2 +- .../operators/collective/barrier_op.cu.cc | 2 +- .../operators/collective/broadcast_op.cu.cc | 2 +- .../operators/collective/c_allgather_op.cu.cc | 2 +- .../operators/collective/c_allreduce_op.h | 2 +- .../operators/collective/c_broadcast_op.cu.cc | 2 +- .../operators/collective/c_concat_op.cu.cc | 6 +- .../operators/collective/c_embedding_op.cu | 6 +- .../fluid/operators/collective/c_reduce_op.h | 2 +- .../collective/c_reducescatter_op.cu.cc | 2 +- .../operators/collective/c_scatter_op.cu.cc | 2 +- .../c_softmax_with_cross_entropy_op.cu | 25 ++- .../fluid/operators/collective/c_split_op.cu | 2 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../operators/collective/c_wait_comm_op.cc | 2 +- .../operators/collective/c_wait_compute_op.cc | 2 +- .../collective/global_gather_op.cu.cc | 2 +- .../collective/global_scatter_op.cu.cc | 2 +- .../collective/partial_allgather_op.cu.cc | 2 +- .../collective/partial_recv_op.cu.cc | 2 +- .../collective/partial_send_op.cu.cc | 2 +- .../operators/collective/recv_v2_op.cu.cc | 2 +- .../operators/collective/send_v2_op.cu.cc | 2 +- paddle/fluid/operators/conv_shift_op.cu | 24 +-- paddle/fluid/operators/conv_transpose_op.cc | 6 +- .../fluid/operators/copy_cross_scope_test.cc | 4 +- paddle/fluid/operators/correlation_op.cu | 14 +- paddle/fluid/operators/cos_sim_op.cu | 8 +- paddle/fluid/operators/crop_op.cc | 14 +- paddle/fluid/operators/cross_entropy_op.cu | 2 +- paddle/fluid/operators/ctc_align_op.cu | 7 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 18 +- paddle/fluid/operators/cvm_op.cu | 6 +- paddle/fluid/operators/data_norm_op.cu | 26 ++- .../operators/deformable_psroi_pooling_op.cu | 2 +- .../fluid/operators/dequantize_abs_max_op.cu | 10 +- paddle/fluid/operators/dequantize_log_op.cu | 8 +- .../fluid/operators/detail/strided_memcpy.h | 6 +- .../detection/anchor_generator_op.cu | 3 +- .../fluid/operators/detection/bbox_util.cu.h | 6 +- .../fluid/operators/detection/box_clip_op.cu | 7 +- .../detection/box_decoder_and_assign_op.cu | 6 +- .../detection/collect_fpn_proposals_op.cu | 14 +- .../detection/density_prior_box_op.cu | 3 +- .../detection/distribute_fpn_proposals_op.cu | 10 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cu | 9 +- .../operators/detection/iou_similarity_op.cu | 7 +- .../fluid/operators/detection/prior_box_op.cu | 3 +- 
.../detection/roi_perspective_transform_op.cu | 4 +- .../detection/sigmoid_focal_loss_op.cu | 11 +- .../operators/detection/target_assign_op.cu | 17 +- paddle/fluid/operators/dgc_clip_by_norm_op.cu | 5 +- paddle/fluid/operators/dgc_op.cu | 3 +- paddle/fluid/operators/diag_op.cu | 11 +- .../operators/dlnne/dlnne_engine_op_test.cc | 6 +- paddle/fluid/operators/dropout_op_test.cc | 2 +- paddle/fluid/operators/edit_distance_op.cu | 9 +- paddle/fluid/operators/eigvalsh_op.cu | 28 ++-- .../elementwise/elementwise_op_function.h | 4 +- paddle/fluid/operators/expand_as_op.cc | 24 ++- paddle/fluid/operators/expand_op.cc | 24 ++- paddle/fluid/operators/fake_dequantize_op.cu | 2 +- .../fluid/operators/fake_dequantize_op.cu.h | 16 +- paddle/fluid/operators/fake_quantize_op.cu | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 51 +++--- paddle/fluid/operators/fc_op.cu.cc | 9 +- paddle/fluid/operators/feed_forward_test.cu | 12 +- paddle/fluid/operators/fill_any_op.cu.cc | 26 ++- paddle/fluid/operators/fill_constant_op.h | 4 +- paddle/fluid/operators/fill_diagonal_op.cu | 1 - .../operators/fill_diagonal_tensor_op.cu | 6 +- .../fluid/operators/fill_zeros_like_op.cu.cc | 36 ++-- paddle/fluid/operators/flatten_op.cu.cc | 60 ++++--- paddle/fluid/operators/fold_op.cu | 14 +- paddle/fluid/operators/fsp_op.cu | 8 +- .../operators/fused/attention_layer_norm.h | 4 +- .../fluid/operators/fused/attn_bias_add.cu.h | 11 +- .../fluid/operators/fused/attn_feed_forward.h | 8 +- paddle/fluid/operators/fused/attn_gemm.h | 8 +- .../fluid/operators/fused/conv_fusion_op.cu | 12 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 30 ++-- .../fused/cudnn_bn_stats_finalize.cu.h | 8 +- .../operators/fused/cudnn_norm_conv.cu.h | 22 +-- .../operators/fused/cudnn_norm_conv_test.cc | 32 ++-- .../fused/cudnn_scale_bias_add_relu.cu.h | 10 +- paddle/fluid/operators/fused/fmha_ref.h | 8 +- .../operators/fused/fused_attention_op.cu | 2 +- .../operators/fused/fused_bn_activation_op.cu | 24 ++- .../fused/fused_bn_add_activation_op.cu | 13 +- .../operators/fused/fused_dropout_act_bias.h | 4 +- .../fused/fused_dropout_act_bias_test.cu | 6 +- .../operators/fused/fused_dropout_common.h | 6 +- .../operators/fused/fused_dropout_helper.h | 25 ++- .../operators/fused/fused_dropout_test.h | 6 +- .../fused/fused_elemwise_activation_op.cu | 32 ++-- .../fused_embedding_eltwise_layernorm_op.cu | 3 +- .../fused_fc_elementwise_layernorm_op.cu | 4 +- .../operators/fused/fused_feedforward_op.cu | 24 ++- .../operators/fused/fused_gate_attention.h | 14 +- .../fused/fused_gate_attention_op.cu | 4 +- .../operators/fused/fused_gemm_epilogue_op.cu | 19 +-- .../fused_layernorm_residual_dropout_bias.h | 4 +- ...ed_layernorm_residual_dropout_bias_test.cu | 6 +- .../fused/fused_multi_transformer_op.cu | 6 +- .../fused/fused_residual_dropout_bias.h | 4 +- .../fused/fused_residual_dropout_bias_test.cu | 6 +- .../operators/fused/fused_seqpool_cvm_op.cu | 10 +- .../fused/fusion_conv_inception_op.cu | 2 +- .../operators/fused/fusion_group_op.cu.cc | 9 +- .../fusion_transpose_flatten_concat_op.cu.cc | 2 +- .../operators/fused/multihead_matmul_op.cu | 7 +- .../fluid/operators/fused/resnet_unit_op.cu | 4 +- .../operators/fused/skip_layernorm_op.cu | 5 +- .../fluid/operators/fused/yolo_box_head_op.cu | 3 +- .../fluid/operators/fused_softmax_mask_op.cu | 8 +- .../fused_softmax_mask_upper_triangle_op.cu | 11 +- .../fluid/operators/gather_scatter_kernel.cu | 6 +- paddle/fluid/operators/gaussian_random_op.cu | 3 +- .../fluid/operators/graph_khop_sampler_op.cu | 23 +-- 
.../operators/grid_sampler_cudnn_op.cu.cc | 4 +- paddle/fluid/operators/group_norm_op.cu | 28 ++-- paddle/fluid/operators/gru_op.cu.cc | 14 +- paddle/fluid/operators/gru_unit_op.cu | 14 +- paddle/fluid/operators/hinge_loss_op.cc | 10 +- paddle/fluid/operators/im2sequence_op.cc | 10 +- paddle/fluid/operators/inplace_abn_op.cu | 18 +- paddle/fluid/operators/interpolate_op.cu | 12 +- paddle/fluid/operators/isfinite_op.cu | 57 ++----- paddle/fluid/operators/l1_norm_op.cc | 8 +- .../fluid/operators/limit_by_capacity_op.cu | 3 +- paddle/fluid/operators/lite/lite_engine_op.h | 4 +- .../operators/lite/lite_engine_op_test.cc | 2 +- paddle/fluid/operators/lite/ut_helper.h | 2 +- paddle/fluid/operators/load_combine_op.cu | 13 +- paddle/fluid/operators/load_op.cu | 13 +- paddle/fluid/operators/lod_reset_op.cu | 22 ++- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_op.cu | 3 +- paddle/fluid/operators/lookup_table_v2_op.cu | 3 +- paddle/fluid/operators/lrn_op.cu | 22 ++- paddle/fluid/operators/lstm_op.cu.cc | 14 +- paddle/fluid/operators/lstmp_op.cu | 14 +- paddle/fluid/operators/lstsq_op.cu | 67 ++++---- .../operators/margin_cross_entropy_op.cu | 17 +- paddle/fluid/operators/margin_rank_loss_op.cu | 10 +- paddle/fluid/operators/marker_op.cu | 2 +- paddle/fluid/operators/math/beam_search.cu | 12 +- .../fluid/operators/math/beam_search_test.cc | 12 +- .../operators/math/bert_encoder_functor.cu | 31 ++-- .../operators/math/bert_encoder_functor.h | 2 +- .../fluid/operators/math/concat_and_split.cu | 14 +- paddle/fluid/operators/math/concat_test.cc | 21 +-- .../fluid/operators/math/context_project.cu | 4 +- .../fluid/operators/math/cos_sim_functor.cu | 8 +- .../operators/math/eigen_values_vectors.h | 51 +++--- paddle/fluid/operators/math/gru_compute.cu | 20 +-- paddle/fluid/operators/math/im2col_test.cc | 16 +- paddle/fluid/operators/math/sample_prob.cu | 17 +- paddle/fluid/operators/math/sample_prob.h | 2 +- .../operators/math/selected_rows_functor.cu | 29 ++-- .../math/selected_rows_functor_test.cu.cc | 45 ++--- .../operators/math/sequence_padding_test.cc | 8 +- .../fluid/operators/math/sequence_pooling.cu | 16 +- .../operators/math/sequence_pooling_test.cc | 8 +- paddle/fluid/operators/math/tree2col.cu | 20 +-- paddle/fluid/operators/math/unpooling.cu | 32 ++-- paddle/fluid/operators/math/vol2col_test.cc | 16 +- paddle/fluid/operators/matmul_op.cc | 21 +-- paddle/fluid/operators/mean_iou_op.cu | 2 +- paddle/fluid/operators/memcpy_h2d_op.h | 3 +- .../operators/merge_selected_rows_op.cu.cc | 7 +- paddle/fluid/operators/minus_op.cc | 3 +- .../fluid/operators/modified_huber_loss_op.cu | 5 +- .../fluid/operators/nccl/nccl_op_test.cu.cc | 6 +- paddle/fluid/operators/number_count_op.cu | 3 +- paddle/fluid/operators/one_hot_op.cu | 7 +- .../operators/optimizers/cast_with_ptr.h | 4 +- .../optimizers/decayed_adagrad_op.cu | 5 +- .../operators/optimizers/dgc_momentum_op.cu | 5 +- .../distributed_fused_lamb_init_op.cu | 19 +-- .../optimizers/distributed_fused_lamb_op.cu | 51 +++--- paddle/fluid/operators/optimizers/ftrl_op.cu | 3 +- paddle/fluid/operators/optimizers/lamb_op.cu | 7 +- .../operators/optimizers/lars_momentum_op.cu | 51 +++--- .../pow2_decay_with_linear_warmup_op.cu | 4 +- .../optimizers/proximal_adagrad_op.cu | 5 +- .../operators/optimizers/proximal_gd_op.cu | 5 +- paddle/fluid/operators/optimizers/sgd_op.cu | 3 +- .../optimizers/sparse_momentum_op.cu | 7 +- paddle/fluid/operators/pad2d_op.cu | 4 +- .../fluid/operators/pad_constant_like_op.cc | 21 +-- 
paddle/fluid/operators/partial_concat_op.cu | 8 +- paddle/fluid/operators/partial_sum_op.cu | 8 +- paddle/fluid/operators/prroi_pool_op.cu | 7 +- .../operators/prune_gate_by_capacity_op.cu | 2 +- .../pscore/distributed_lookup_table_op.cu.cc | 2 +- .../pscore/distributed_push_sparse_op.cu.cc | 4 +- .../operators/pscore/send_and_recv_op.cc | 11 +- .../pscore/send_and_recv_op_gpu_test.cc | 5 +- paddle/fluid/operators/py_layer_op.cc | 30 ++-- paddle/fluid/operators/qr_op.cu | 155 +++++++++--------- paddle/fluid/operators/quantize_linear_op.cu | 10 +- paddle/fluid/operators/random_crop_op.cu | 2 +- paddle/fluid/operators/random_crop_op.h | 2 +- paddle/fluid/operators/random_routing_op.cu | 3 +- paddle/fluid/operators/rank_attention_op.cu | 18 +- paddle/fluid/operators/rank_loss_op.cc | 7 +- .../fluid/operators/reader/buffered_reader.cc | 4 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 2 +- paddle/fluid/operators/renorm_op.cu | 3 +- .../fluid/operators/repeat_interleave_op.cu | 33 ++-- paddle/fluid/operators/reshape_op.cc | 6 +- paddle/fluid/operators/row_conv_op.cu | 16 +- paddle/fluid/operators/run_program_op.cu.cc | 10 +- paddle/fluid/operators/sample_logits_op.cu | 4 +- paddle/fluid/operators/save_combine_op.cu | 11 +- paddle/fluid/operators/save_op.cu | 15 +- paddle/fluid/operators/seed_op.cu | 5 +- .../sequence_ops/sequence_concat_op.cu.cc | 24 +-- .../sequence_ops/sequence_conv_op.cu.cc | 14 +- .../sequence_ops/sequence_expand_as_op.cu | 29 ++-- .../sequence_ops/sequence_expand_op.cu | 30 ++-- .../sequence_ops/sequence_mask_op.cu | 12 +- .../operators/sequence_ops/sequence_pad_op.cu | 22 ++- .../sequence_ops/sequence_pool_op.cu | 10 +- .../sequence_ops/sequence_reshape_op.cu | 20 +-- .../sequence_ops/sequence_reverse_op.cu | 13 +- .../sequence_ops/sequence_slice_op.cu | 20 +-- .../sequence_softmax_cudnn_op.cu.cc | 10 +- .../sequence_ops/sequence_softmax_op.cc | 6 +- .../sequence_ops/sequence_softmax_op.cu | 20 +-- .../sequence_ops/sequence_unpad_op.cu | 20 +-- paddle/fluid/operators/shuffle_batch_op.cu | 11 +- paddle/fluid/operators/shuffle_channel_op.cu | 11 +- paddle/fluid/operators/slice_op.cc | 44 ++--- paddle/fluid/operators/smooth_l1_loss_op.cu | 10 +- paddle/fluid/operators/space_to_depth_op.cu | 22 ++- paddle/fluid/operators/sparse_attention_op.cu | 16 +- paddle/fluid/operators/spectral_op.cu | 42 ++--- paddle/fluid/operators/spectral_op.cu.h | 32 ++-- paddle/fluid/operators/spp_op.cu.cc | 14 +- .../fluid/operators/squared_l2_distance_op.cu | 8 +- paddle/fluid/operators/squeeze_op.cu.cc | 48 +++--- paddle/fluid/operators/stft_op.cu | 14 +- paddle/fluid/operators/strided_memcpy.h | 3 +- paddle/fluid/operators/strided_memcpy_test.cc | 4 +- paddle/fluid/operators/sum_op.cu | 34 ++-- paddle/fluid/operators/tensor_to_string.h | 2 +- .../operators/tensorrt/tensorrt_engine_op.h | 3 +- .../tensorrt/tensorrt_engine_op_test.cc | 6 +- .../test_leaky_relu_grad_grad_functor.h | 5 +- paddle/fluid/operators/top_k_function_cuda.h | 2 +- paddle/fluid/operators/top_k_op.cu | 28 ++-- paddle/fluid/operators/tree_conv_op.cu | 14 +- .../operators/uniform_random_inplace_op.cu | 5 +- paddle/fluid/operators/uniform_random_op.h | 3 +- paddle/fluid/operators/unpool_op.cu.cc | 28 ++-- paddle/fluid/operators/unsqueeze_op.cu.cc | 53 +++--- paddle/fluid/platform/bfloat16_test.cu | 2 +- paddle/fluid/platform/collective_helper.cc | 12 +- paddle/fluid/platform/collective_helper.h | 2 +- .../platform/cuda_graph_with_memory_pool.cc | 6 +- .../platform/device/gpu/cuda/cudnn_helper.h | 2 +- 
.../platform/device/gpu/gpu_launch_config.h | 20 +-- .../fluid/platform/device/gpu/nccl_helper.h | 10 +- .../platform/device/gpu/rocm/miopen_helper.h | 2 +- paddle/fluid/platform/device_code.cc | 6 +- paddle/fluid/platform/device_context.cc | 13 +- paddle/fluid/platform/device_context.h | 8 +- paddle/fluid/platform/device_context_test.cu | 12 +- .../device_context_test_cuda_graph.cu | 2 +- paddle/fluid/platform/device_event_gpu.cc | 10 +- paddle/fluid/platform/device_event_test.cc | 6 +- paddle/fluid/platform/float16_test.cu | 2 +- paddle/fluid/platform/transform_test.cu | 10 +- paddle/fluid/pybind/pybind.cc | 2 +- .../kernels/funcs/detail/lstm_gpu_kernel.h | 8 +- paddle/phi/kernels/funcs/gru_compute.cu | 22 ++- paddle/phi/kernels/funcs/lstm_compute.cu | 18 +- paddle/phi/kernels/funcs/math_function.cu | 95 ++++++----- paddle/phi/kernels/funcs/sequence2batch.cu | 22 +-- paddle/phi/kernels/gpu/top_k_kernel.cu | 10 +- .../phi/tests/kernels/test_math_function.cu | 24 +-- .../tests/custom_op/custom_raw_op_kernel_op.h | 2 +- 373 files changed, 1976 insertions(+), 2483 deletions(-) diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h index 6eb169d8fff..89ce00fe874 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.h +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -94,7 +94,7 @@ class NPUEventManager { PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 197761dc3c3..c00b081438c 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -104,7 +104,7 @@ class EventManager { bool DeviceId() const { return device_index_; } gpuEvent_t GetRawCudaEvent() const { return event_; } - void Record(const paddle::platform::CUDADeviceContext& ctx) { + void Record(const phi::GPUContext& ctx) { auto device_index = ctx.GetPlace().device; if (!is_created_) { CreateEvent(device_index); @@ -112,7 +112,7 @@ class EventManager { PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); @@ -157,13 +157,13 @@ class EventManager { } } - void Block(const paddle::platform::CUDADeviceContext& ctx) const { + void Block(const phi::GPUContext& ctx) const { if (is_created_) { auto device_index = ctx.GetPlace().device; PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 81db9b94da9..d776f62373e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -31,10 +31,10 @@ namespace distributed { void SyncDefaultStream( const std::vector& places, - std::vector& ncclEvents, // NOLINT - std::vector>& dev_ctx) { // NOLINT + std::vector& ncclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT for (size_t i = 0; i < places.size(); ++i) { - auto* default_ctx = 
static_cast( + auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); ncclEvents[i].Record(*default_ctx); ncclEvents[i].Block(*dev_ctx[i]); @@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs( void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { for (size_t i = 0; i < places_.size(); ++i) { - auto* default_ctx = static_cast( + auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places_[i])); default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent()); } @@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); - std::vector> dev_ctx; + std::vector> dev_ctx; dev_ctx.resize(places.size()); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); @@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( for (size_t i = 0; i < places.size(); ++i) { platform::CUDADeviceGuard guard(places[i]); nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); - dev_ctx[i].reset(new CUDADeviceContext(places[i])); + dev_ctx[i].reset(new phi::GPUContext(places[i])); } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4dd44771d15..5adb6867eb8 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -45,7 +45,6 @@ namespace paddle { namespace distributed { using Place = paddle::platform::Place; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; class ProcessGroupNCCL : public ProcessGroup { public: @@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup { std::unordered_map> places_to_events_; - std::unordered_map>> + std::unordered_map>> places_to_ctx_; std::set used_place_ids_; diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 5f137c4d0af..8f4466f7baa 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context, void EagerGroup::ConcatTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( *default_ctx, dense_tensors_, &dense_contents_, dtype_); @@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( *default_ctx, &dense_contents_, &dense_tensors_, dtype_); @@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); if (platform::is_gpu_place(inner_place_)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - dev_ctx = static_cast( + dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc 
b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e641d6311c6..0b46369b970 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, VLOG(3) << "Loading data for GPU."; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - dynamic_cast(pool.Get(place)); + auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; memory::Copy(gpu_place, static_cast(input_tensor_ptr), diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 3ed4277c61e..b98e85f9c23 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var, framework::DataTypeSize(tensor->dtype())]; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), @@ -311,8 +308,7 @@ void DeserializeSelectedRows( unsigned long data_len; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 91a20a432a3..84ef0b02bed 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx, std::vector temp; temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype())); char* temp_ptr = temp.data(); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(platform::CPUPlace(), temp_ptr, tensor->place(), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 1409119daf1..002b8330763 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { - auto* dev_ctx = dynamic_cast( - 
pool.Get(expected_kernel_place)); + auto* dev_ctx = + dynamic_cast(pool.Get(expected_kernel_place)); if (!dev_ctx) { PADDLE_THROW(paddle::platform::errors::Fatal( "Cannot convert device_context to CUDADeviceContext." "This indicates backend mismatch." "Pleas double check your expected place")); } - ScaleDeviceDispatch( - *dense_tensor.get(), - *dev_ctx, - scale, - bias, - bias_after_scale, - dense_out.get()); + ScaleDeviceDispatch(*dense_tensor.get(), + *dev_ctx, + scale, + bias, + bias_after_scale, + dense_out.get()); #endif } else { PADDLE_THROW(paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 6b2b9c9f34a..f8c06a5afff 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::framework::details::tensor_check< - paddle::platform::CUDADeviceContext>( + paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index f275e3f0bf1..6441ce1e788 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(place, mutable_x, @@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); auto* x_tensor = X->MutableVar()->GetMutable(); @@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 05ab86028da..b41938d4856 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue( if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = 
paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h index 8540fc7e10d..0e62e5c2da6 100644 --- a/paddle/fluid/eager/tests/test_utils.h +++ b/paddle/fluid/eager/tests/test_utils.h @@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 94e7918e800..cd76747c035 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT( paddle::framework::OpKernelTestProtoAndCheckerMaker); REGISTER_OP_CPU_KERNEL(test_op, paddle::framework::TestKernel); -REGISTER_OP_CUDA_KERNEL( - test_op, - paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL(test_op, + paddle::framework::TestKernel); static void BuildVar(const std::string& param_name, std::initializer_list arguments, diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 8ffb58f9451..4b5177aaa45 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place, const std::vector& infos) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {} void MiniBatchGpuPack::reset(const paddle::platform::Place& place) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); ins_num_ = 0; diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index d144673d62d..681fb1fdb29 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset( const int float_slot_size, const UsedSlotGpuType *used_slots) { auto stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); FillSlotValueOffsetKernel<<( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9333e246c68..59d20306c66 
100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -103,8 +103,8 @@ struct CastDataType { CastDataTypeFunctor()); #if defined(__NVCC__) || defined(__HIPCC__) } else if (platform::is_gpu_place(in_.place())) { - platform::Transform trans; - auto* context = static_cast(ctx_); + platform::Transform trans; + auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index ed5b7fc692b..8490afd69d9 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -19,7 +19,7 @@ limitations under the License. */ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 26ad71bafe6..154bf2b354e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -105,7 +105,7 @@ struct TestBroadcastOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); place_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); #else diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index dafeb5cdb26..1e384143a3c 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( gc_(gc) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - dev_ctx_ = reinterpret_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard(place.device); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index a30e80b204d..0a92269c50a 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase { GarbageCollector *gc_; // not own std::vector vars_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDADeviceContext *dev_ctx_{nullptr}; + phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif }; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 9cc1929e19a..45d8939f788 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -58,7 +58,7 @@ struct TestGatherOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } #else PADDLE_THROW( diff --git 
a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 767f7b1e48b..ea292712610 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor_check( - op_type, var_name, *tensor, place); + tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 59bbef3a095..d91225a8141 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value, template <> template -void TensorCheckerVisitor::apply( +void TensorCheckerVisitor::apply( typename std::enable_if< std::is_floating_point::value || std::is_same>::value || @@ -143,7 +143,7 @@ void TensorCheckerVisitor::apply( const { int print_num = 3; - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); int dev_id = tensor_.place().device; PADDLE_ENFORCE_EQ( @@ -226,13 +226,13 @@ void TensorCheckerVisitor::apply( } template <> -void tensor_check(const std::string& op_type, - const std::string& var_name, - const framework::Tensor& tensor, - const platform::Place& place) { +void tensor_check(const std::string& op_type, + const std::string& var_name, + const framework::Tensor& tensor, + const platform::Place& place) { std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap); - TensorCheckerVisitor vistor( + TensorCheckerVisitor vistor( op_type, var_name, tensor, place); VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3d8cb208017..82f09f51c23 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { dev_ctx.second->Wait(); } } else { - auto stream = - static_cast(waited_ctx)->stream(); + auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); @@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = - static_cast(dev_ctxes_.at(place)) - ->stream(); + static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = - static_cast(pool.Get(place)) - ->stream(); + static_cast(pool.Get(place))->stream(); platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { if 
(in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto stream = static_cast( + auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP @@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; - auto *cuda_dev_ctx = static_cast(p.second); - VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; + auto *cuda_dev_ctx = static_cast(p.second); + VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, callback(); } else { auto *ctx = dev_ctxes_.at(p); - auto *cuda_ctx = static_cast(ctx); + auto *cuda_ctx = static_cast(ctx); cuda_ctx->RecordEvent(events_.at(p.device), callback); } #else diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 0d957bf8130..ad7888c0654 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -69,7 +69,7 @@ struct TestReduceOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new p::phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index f0c152c34d3..b453e7c4a81 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -77,7 +77,7 @@ struct ScaleLossGradFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); - auto stream = static_cast(ctx_)->stream(); + auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, out_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c4bc5905aca..5f46906cf8e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, const int hidden_size, const int expand_embed_dim, const int64_t total_length) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); #ifdef PADDLE_WITH_HIP @@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const int expand_embed_dim, const int64_t total_length, const int batch_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/box_wrapper.h 
b/paddle/fluid/framework/fleet/box_wrapper.h index 297f4cb4796..c4cec547bd8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -223,10 +223,10 @@ class AfsManager { delete read_stream; } int PopenBidirectionalInternal(const char* command, - FILE*& fp_read, // NOLINT - FILE*& fp_write, - pid_t& pid, // NOLINT - bool read, // NOLINT + FILE*& fp_read, // NOLINT + FILE*& fp_write, // NOLINT + pid_t& pid, // NOLINT + bool read, // NOLINT bool write) { std::lock_guard g(g_flock); int fd_read[2]; @@ -440,10 +440,9 @@ class BoxWrapper { std::vector stream_list; for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; - platform::CUDADeviceContext* context = - dynamic_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(i))); + phi::GPUContext* context = dynamic_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(i))); stream_list_[i] = context->stream(); stream_list.push_back(&stream_list_[i]); } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index ccc3575c42a..e57a02d7299 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -300,7 +300,7 @@ void AccessorWrapper::CopyForPullImpl( const int64_t total_length, int* gpu_dim, int feature_value_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -333,7 +333,7 @@ void AccessorWrapper::CopyForPushImpl( size_t grad_value_size, std::vector& slot_vector, std::vector& slot_mf_dim_vector) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 0e806fdb5f5..36b789bdd11 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c1f8041cc1e..77a666a24d9 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( : GarbageCollector(place, max_memory_size) {} void DefaultStreamGarbageCollector::Wait() const { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); + static_cast(this->dev_ctx_)->WaitStreamCallback(); } void DefaultStreamGarbageCollector::ClearCallback( const std::function &callback) { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); + static_cast(this->dev_ctx_)->AddStreamCallback(callback); } StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 73139dee6e0..f5c226631e0 100644 --- 
a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope, char* temp_ptr = temp.data(); float* temp_ptr_float = reinterpret_cast(temp_ptr); temp_ptr_float[0] = micro_id; - auto stream = - reinterpret_cast(*dev_ctx).stream(); + auto stream = reinterpret_cast(*dev_ctx).stream(); memory::Copy( place, tensor_data, diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index da52af0faf3..0afeecd06b0 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, } } #ifdef PADDLE_WITH_CUDA - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index ce5f4d743c6..690dea51632 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name, device_code.SetWorkloadPerThread(1); device_code.Launch(n, &args); - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index cb77542e262..c3c3581a6a7 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector *cpu_, size_t *gpu_memory_size_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // COPY GPU Data To CPU - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get((*gpu_)->place())); auto stream = dev_ctx->stream(); void *src = (*gpu_)->ptr(); @@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory::Alloc(place, *gpu_memory_size_); void *dst = (*gpu_)->ptr(); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 0eaf5dd69a5..61d256ffb22 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) { } gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { - return reinterpret_cast( + return reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3680f0aa900..4b72d6bea34 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { platform::RecordEvent record( "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); - gpuStream_t stream = reinterpret_cast( - 
instr.DeviceContext()) - .stream(); + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()).stream(); auto TensorRecordStream = [&stream](Tensor& tensor) { auto allocation = tensor.Holder(); if (allocation == nullptr) { diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index fa0528d4882..9ef577f6285 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::OpKernelTest); REGISTER_OP_CUDA_KERNEL( - op_with_kernel, - paddle::framework::OpKernelTest); + op_with_kernel, paddle::framework::OpKernelTest); TEST(OperatorRegistrar, CPU) { paddle::framework::proto::OpDesc op_desc; @@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) { } static int op_test_value = 0; -using paddle::platform::CUDADeviceContext; using paddle::platform::DeviceContext; using phi::CPUContext; +using phi::GPUContext; namespace paddle { namespace framework { @@ -301,7 +299,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest +class OpMultiKernelTest : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -325,7 +323,7 @@ class OpMultiKernelTest2 }; template -class OpMultiKernelTest2 +class OpMultiKernelTest2 : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -351,12 +349,12 @@ REGISTER_OP_KERNEL( op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest); + paddle::framework::OpMultiKernelTest); REGISTER_OP_KERNEL( op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest2); + paddle::framework::OpMultiKernelTest2); TEST(OperatorRegistrar, OpWithMultiKernel) { paddle::framework::proto::OpDesc op_desc; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2568a459f31..cb6b2d832bf 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -416,13 +416,12 @@ class ExecutionContext { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - const inline platform::CUDADeviceContext& cuda_device_context() const { + const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, platform::errors::PreconditionNotMet( "Current device context place is not GPUPlace.")); - return *reinterpret_cast( - &device_context_); + return *reinterpret_cast(&device_context_); } #endif diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4ad966887f3..26150b2d04b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices( nccl_ctxs->WaitAll(); } else { auto src_place = member_->places_[0]; - auto src_dev_ctx = static_cast( + auto src_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(src_place)); auto sizeof_dtype = framework::SizeOfType(dtype) * numel; for (size_t i = 1; i < member_->places_.size(); ++i) { auto dst_place = member_->places_[i]; - auto dst_dev_ctx = static_cast( + auto dst_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(dst_place)); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); @@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope 
*global_scope) { global_scope, member_->places_); auto &pool = platform::DeviceContextPool::Instance(); for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); + auto *dev_ctx = + static_cast(pool.Get(member_->places_[dev_id])); auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); dev_ctx->set_nccl_comm(nccl_ctx.comm()); } diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 6c8e8251579..050a51a0f10 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPhiContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dbb549efa25..f7f05da6340 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src, "place is %s, context place is %s.", src_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT @@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src, "destination place is %s, context place is %s.", dst_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && // NOLINT @@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", src_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } @@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", dst_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } @@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); @@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); - auto& gpu_dev_ctx = - static_cast(dev_ctx); + auto& gpu_dev_ctx = static_cast(dev_ctx); platform::CPUPlace cpu; uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { @@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); - memory::Copy( - dst_place, - dst_ptr, - 
src_place, - src_ptr, - size, - reinterpret_cast(*ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(*ctx).stream()); } #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 79532172571..b1bba0f7c35 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -164,13 +164,12 @@ void TensorFromArray(const T* src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -242,13 +241,12 @@ void TensorFromVector(const std::vector& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector& src, } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) @@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 74454a5a09b..36be5cde506 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) { // Copy to GPUTensor gpu_tensor.Resize(phi::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); - paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -238,7 +238,7 @@ 
TEST(TensorToVector, Tensor) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { -{ - paddle::framework::Tensor src; - bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); } +} // namespace framework #ifdef PADDLE_WITH_CUDA { @@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) { }; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) { } } #endif -} +} // namespace paddle TEST(TensorFromDLPack, Tensor) { { @@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) { Tensor dst_tensor; auto gpu_place = new platform::CUDAPlace(); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b948a191df7..c9d3d2591d0 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); auto nccl_dtype = platform::ToNCCLDataType(dtype); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); @@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src, int ring_id, bool use_calc_stream) { const auto &place = GetVarPlace(src); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::NCCLComm *comm = platform::NCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b656da34fb6..e6e156fa61c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -122,10 +122,9 @@ class TensorAddFunctor #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void operator()(const platform::CUDAPlace& place) const { - platform::CUDADeviceContext* ctx = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = phi::funcs::GetBlas(*ctx); + 
phi::GPUContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = phi::funcs::GetBlas(*ctx); blas.AXPY(numel_, 1., x_, y_); } #else @@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::FP16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); @@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); @@ -614,8 +613,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 99c4a02e82b..94ac86e97e1 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -85,7 +85,7 @@ void NCCLParallelContext::Init() { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, @@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, 
gpu_id, ring_id); @@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ring_id, compute_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ring_id, comm_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { } void NCCLParallelContext::SynchronizeCompute() { - auto *compute_dev_ctx = static_cast( + auto *compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 468263e7be7..1c3165a4538 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ConcatTensorsWithType( - static_cast(context), - dense_tensors_, - &dense_contents_, - dtype_); + ConcatTensorsWithType(static_cast(context), + dense_tensors_, + &dense_contents_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat grad tensors since it's not compiled with NCCL," @@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - SplitTensorsWithType( - static_cast(context), - &dense_contents_, - &dense_tensors_, - dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split grad tensor since it's not compiled with NCCL," diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 5b29e568089..a3f840f38bf 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor, "Unsupport BF16 in DataParallel for now")); } framework::VisitDataTypeForHIP( - dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #else - framework::VisitDataType(dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + framework::VisitDataType( + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #endif } #endif diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index 67059916d03..597a9a64669 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { void AllReduceByStream(int local_rank, int device_id) { int data_size = 32; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // heter_parallel_ctx imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc 
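The call sites above all follow one pattern: look the place up in platform::DeviceContextPool, cast the returned DeviceContext to phi::GPUContext (previously platform::CUDADeviceContext), and read its compute stream. A minimal illustrative sketch of that pattern, not taken verbatim from any single hunk:

    // Illustrative only: resolve the phi::GPUContext registered for a CUDA
    // place and obtain its compute stream, as the updated call sites do.
    auto *dev_ctx = static_cast<phi::GPUContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    gpuStream_t stream = dev_ctx->stream();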
index 89938d2d7a2..13843ddbe5c 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) { int data_size = 4; float test_data = 7; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8971448071f..bde92c13b4c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, "Only one choice can be made between CPU and XPU.")); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place)); + auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; memory::Copy(dst_gpu_place, static_cast(input_ptr), @@ -283,7 +282,7 @@ bool AnalysisPredictor::Init( // NOTE: If the external_stream equals to global_device_contexts's stream, // then fallback. auto global_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); if (predictor_stream_ != global_stream) { @@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); auto gpu_place = place_; - auto *dev_ctx = static_cast( - pool.Get(gpu_place)); + auto *dev_ctx = static_cast(pool.Get(gpu_place)); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); #else @@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) { auto *pred = dynamic_cast(p->predictor_.get()); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto *dev_ctx = reinterpret_cast( - pool.Get(pred->place_)); + auto *dev_ctx = reinterpret_cast(pool.Get(pred->place_)); cudaStreamSynchronize(dev_ctx->stream()); #endif } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 34dade3628a..2ba806a0529 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place_)); + auto *dev_ctx = static_cast(pool.Get(place_)); auto dst_gpu_place = place_; memory::Copy(dst_gpu_place, static_cast(input_ptr), diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 171db0807e7..51b27f8ca3a 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst, paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); paddle::platform::CUDAPlace gpu_place(dst.device_); - auto* dev_ctx = static_cast( - pool.Get(gpu_place)); + auto* dev_ctx = static_cast(pool.Get(gpu_place)); if (src.place() == PlaceType::kCPU) { paddle::memory::Copy(gpu_place, diff --git a/paddle/fluid/inference/lite/tensor_utils.cc 
b/paddle/fluid/inference/lite/tensor_utils.cc index e9ffbbd4494..454cd49d3ab 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place, } else if (platform::is_gpu_place(dst_place) && platform::is_gpu_place(src_place)) { auto gpu_place = src_place; - memory::Copy( - gpu_place, - dst_data, - gpu_place, - src_data, - size, - static_cast(ctx).stream()); + memory::Copy(gpu_place, + dst_data, + gpu_place, + src_data, + size, + static_cast(ctx).stream()); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 45b9d222c4c..e4054c5df67 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) { framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 43e1d8770c3..eea51e8ff1e 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - platform::GpuStreamSync( - static_cast(ctx).stream()); + platform::GpuStreamSync(static_cast(ctx).stream()); } #endif std::vector result; diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index d770ef5478a..06555114164 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) { TEST(EngineIOConverterTester, DefaultGPU) { platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); IOConverterTester(ctx); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 2cdf3623306..9b80aeb1d49 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -124,7 +124,7 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); @@ -172,7 +172,7 @@ class TRTConvertValidation { "But received batch_size:%d, max_batch_size_:%d", batch_size, max_batch_size_)); - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); op_->Run(scope_, place_); cudaStreamSynchronize(stream_); std::vector input_output_names; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 499c21723fe..9602e6c8790 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto *device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, @@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto *device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); @@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue( apply_scale<<>>( tptr, static_cast(scale_), n_q); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 97b97aa3a4b..6ac23e32856 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -34,7 +34,7 @@ namespace tensorrt { class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicEngineTest, test_spmm) { @@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { std::vector inputs_; std::vector outputs_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 2836295f006..dc8065ab2a6 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -26,7 +26,7 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -69,7 +69,7 @@ class TensorRTEngineTest : public 
::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 57782494eaf..44bcc10abae 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) { std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); platform::CUDAPlace gpu(0); - platform::CUDADeviceContext dev_ctx(gpu); + phi::GPUContext dev_ctx(gpu); dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu, dev_ctx.stream()) .get()); @@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) { size_t* data = reinterpret_cast(allocation->ptr()); ForEachFill fill(data); - platform::ForRange for_range(dev_ctx, - allocate_size); + platform::ForRange for_range(dev_ctx, allocate_size); for_range(fill); memory::Copy(platform::CPUPlace(), diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 662bcc401bd..f7e74e04212 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -29,53 +29,51 @@ namespace memory { namespace allocation { /** - * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation. - * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath - * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream + * GPUContextAllocation is a wrapper of the underbeneath allocation. + * GPUContextAllocation adds a CUDA stream callback for the underbeneath + * allocation so that GPUContextAllocation can be used in a CUDA stream * which deletes allocation in the callback. 
*/ -class CUDADeviceContextAllocation : public Allocation { +class GPUContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) + explicit GPUContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} - ~CUDADeviceContextAllocation() { + ~GPUContextAllocation() { PADDLE_ENFORCE_NOT_NULL( dev_ctx_, platform::errors::PreconditionNotMet( - "Device context is not set for CUDADeviceContextAllocation")); + "Device context is not set for GPUContextAllocation")); auto *p_allocation = underlying_allocation_.release(); - VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " + VLOG(4) << "Adding callback to delete GPUContextAllocation at " << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { - VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; + VLOG(4) << "Delete GPUContextAllocation at " << p_allocation; Allocator::AllocationDeleter(p_allocation); }); } - void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { - dev_ctx_ = dev_ctx; - } + void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; } private: DecoratedAllocationPtr underlying_allocation_; - const platform::CUDADeviceContext *dev_ctx_{nullptr}; + const phi::GPUContext *dev_ctx_{nullptr}; }; /** - * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation + * GPUContextAllocator will allocate a GPUContextAllocation * after waiting for a self-created event on the default stream. It does so to * let the non-default stream be able to allocate GPU memory which will be * released by stream callback */ -class CUDADeviceContextAllocator : public Allocator { +class GPUContextAllocator : public Allocator { public: - explicit CUDADeviceContextAllocator(platform::CUDAPlace place, - gpuStream_t default_stream) + explicit GPUContextAllocator(platform::CUDAPlace place, + gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator { #endif } - ~CUDADeviceContextAllocator() { + ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( - "Default stream is not set for CUDADeviceContextAllocator")); + "Default stream is not set for GPUContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = new CUDADeviceContextAllocation( + auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP @@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator { }; /** - * CUDADeviceContextAllocatorPool is a singletion stores mapping from - * CUDAPlace(s) to std::shared_ptr. When a - * CUDADeviceContext's compute stream isn't default stream, it can call this + * GPUContextAllocatorPool is a singletion stores mapping from + * CUDAPlace(s) to std::shared_ptr. When a + * phi::GPUContext's compute stream isn't default stream, it can call this * class to allocate GPU memory which will be released by a callback after * stream execution. 
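 *
 * Usage sketch (illustrative only; it assumes the Alloc() interface declared
 * just below and an existing phi::GPUContext `dev_ctx` whose compute stream
 * is not the default stream):
 *
 *   AllocationPtr allocation =
 *       GPUContextAllocatorPool::Instance().Alloc(dev_ctx, 256);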
*/ -class CUDADeviceContextAllocatorPool { +class GPUContextAllocatorPool { public: - static CUDADeviceContextAllocatorPool &Instance() { - static CUDADeviceContextAllocatorPool pool; + static GPUContextAllocatorPool &Instance() { + static GPUContextAllocatorPool pool; return pool; } - AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { + AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) { auto iter = allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); PADDLE_ENFORCE_NE( @@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool { platform::errors::NotFound("No allocator found for CUDAPlace.")); auto &allocator = iter->second; AllocationPtr allocation = allocator->Allocate(size); - static_cast(allocation.get()) - ->SetCUDADeviceContext(&dev_ctx); + static_cast(allocation.get()) + ->SetGPUContext(&dev_ctx); return allocation; } private: - CUDADeviceContextAllocatorPool() { + GPUContextAllocatorPool() { std::vector devices = platform::GetSelectedDevices(); for (int i : devices) { auto place = platform::CUDAPlace(i); auto compute_stream = platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); - auto allocator = std::shared_ptr( - new CUDADeviceContextAllocator(place, compute_stream)); + auto allocator = std::shared_ptr( + new GPUContextAllocator(place, compute_stream)); allocators_.insert(make_pair(place, allocator)); } } - std::map> + std::map> allocators_; }; diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 05e712e72f2..b3308ffdd30 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -37,7 +37,7 @@ const int NUM_STREAMS = 8; const int N = 2; const float DELTA = 1e-1; -using CudaDevCtxVec = std::vector>; +using CudaDevCtxVec = std::vector>; __global__ void kernel(float *x, int n) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) { void MultiStreamCompute(float **data, float **second_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // multi-streams AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); @@ -88,7 +88,7 @@ void MultiStreamCompute(float **data, #endif } -TEST(Malloc, CUDADeviceContextMultiStream) { +TEST(Malloc, GPUContextMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); @@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { } } -TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { +TEST(Malloc, GPUContextMultiThreadMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu 
b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 96831b6bafc..67f2df8cda5 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { allocation_implicit_stream.reset(); gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = @@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { size_t alloc_size = 256; gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); std::shared_ptr allocation_implicit_stream = diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index fae675142bc..49f78715c2c 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { using framework::Tensor; +using phi::GPUContext; using platform::ActivationDescriptor; -using platform::CUDADeviceContext; using platform::TensorDescriptor; #ifdef PADDLE_WITH_HIP @@ -39,12 +39,12 @@ template struct CudnnActivationFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -77,7 +77,7 @@ struct CudnnActivationFunctor { out->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -90,12 +90,12 @@ template struct CudnnActivationGradFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor { dx->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor { template struct CudnnReluFunctor : public CudnnActivationFunctor { - explicit CudnnReluFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} }; template struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { template struct CudnnRelu6Functor : public CudnnActivationFunctor { - explicit 
CudnnRelu6Functor(const CUDADeviceContext& ctx) + explicit CudnnRelu6Functor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} }; template struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { - explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) + explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor( ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} @@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { template struct CudnnSigmoidFunctor : public CudnnActivationFunctor { - explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} }; template struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { template struct CudnnTanhFunctor : public CudnnActivationFunctor { - explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} }; template struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -221,7 +221,7 @@ class CudnnActivationKernel framework::Tensor* Out = nullptr; ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out); } @@ -242,7 +242,7 @@ class CudnnActivationGradKernel ExtractActivationGradTensor( context, &X, &Out, &dOut, &dX); dX->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"), GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"), diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 9aafb70c7dc..76a05aa37a6 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor = namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + 
ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); -#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6_grad, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index f638f6943ff..8fcdb323884 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(affine_channel, ops::AffineChannelCUDAKernel, diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index a5b3f9fcfda..48832ac1d6d 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support for CUDAPlace.Please switch your context from " "CPUPlace to CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* theta = ctx.Input("Theta"); auto* output = ctx.Output("Output"); @@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { "support for CUDAPlace. 
Please switch " "your context from CPUPlace to " "CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto theta_grad = ctx.Output(framework::GradVarName("Theta")); diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index b1ed3835e75..a5d4c6484a1 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { } template -struct Linspace { +struct Linspace { void operator()(T start, T end, int count, @@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel { w = size_attr[3]; } T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( ctx.cuda_device_context(), theta_grad, static_cast(0)); T h_step; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 0338fb5d2f2..35b667825af 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); auto outs = ctx.MultiOutput("Out"); @@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); framework::Tensor inverse_scale = - ctx.AllocateTmpTensor({1}, - dev_ctx); + ctx.AllocateTmpTensor({1}, dev_ctx); MPDType* inverse_scale_v = inverse_scale.template data(); InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index d76dd13e5bc..4c927066892 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, @@ -134,9 +132,9 @@ class UpdateLossScalingFunctor -class LazyZeros { +class LazyZeros { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { @@ -204,7 +202,7 @@ class LazyZeros { namespace ops = paddle::operators; namespace plat = paddle::platform; -using GPU = paddle::platform::CUDADeviceContext; +using GPU = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(update_loss_scaling, ops::UpdateLossScalingKernel, diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index a2af64e2276..5fee66d968b 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -54,7 +54,7 @@ 
struct ArrayToLoDFunctor : public std::unary_function { Apply(static_cast(pool.Get(place))); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(pool.Get(place))); + Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 164744527e2..3f36e8b1347 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel { *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len); cpu_eff_num_len_data = cpu_eff_num_len.data()[0]; } - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); auto out_data = out->mutable_data(out_dims, place); diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index e97f1261845..362489e51ac 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel { T* out_data = output->mutable_data(ctx.GetPlace()); // initialize auto out_eigen = framework::EigenVector::Flatten(*output); - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); out_eigen.device(place) = out_eigen.constant(static_cast(0)); CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel { int64_t strideA = ins_num * in_dim; int64_t strideB = in_dim * out_dim; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, ins_num, @@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { auto in_dim = input_dims[2]; auto out_dim = w_dims[2]; - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); // initialize dx->mutable_data(ctx.GetPlace()); auto dx_eigen = framework::EigenVector::Flatten(*dx); @@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { out_dim, db_data); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(batch_fc, ops::BatchFCCUDAKernel, ops::BatchFCCUDAKernel); diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc index 15aca070221..93f538e6789 100644 --- a/paddle/fluid/operators/beam_search_op.cu.cc +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -17,9 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); +REGISTER_OP_CUDA_KERNEL(beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index a4fa631f741..7afb3f1135d 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_OP_CUDA_KERNEL(transfer_dtype, ops::CastOpKernel, diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index bec1bb662de..2548b135591 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(center_loss, ops::CenterLossCUDAKernel, ops::CenterLossCUDAKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc index afa350ef116..ae9dd3401fd 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -17,8 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( cinn_instruction_run, - ops::CinnInstructionRunOpKernel); + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 64980dfb013..7dbf2fee0c2 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -18,6 +18,4 @@ limitations under the License. 
*/ /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( - cinn_launch, - paddle::operators::CinnLaunchOpKernel); + cinn_launch, paddle::operators::CinnLaunchOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc index 26fee2d9e57..48efa5c5116 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.cc +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -21,10 +21,8 @@ namespace paddle::operators::details { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); +void* GetStream(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = ctx.template device_context(); return dev_ctx.stream(); } #endif diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h index 55ee3789c0a..4387095fefa 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.h +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx); +void* GetStream(const framework::ExecutionContext& ctx); #endif } // namespace details diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index a0642694843..b92062b1aee 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( class_center_sample, - ops::ClassCenterSampleCUDAKernel, - ops::ClassCenterSampleCUDAKernel); + ops::ClassCenterSampleCUDAKernel, + ops::ClassCenterSampleCUDAKernel); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 561d2696fef..4a11e6d5723 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif #if defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index af299fc6b5a..174a5afa69d 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -17,10 +17,9 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - allreduce, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_CUDA_KERNEL(allreduce, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 12708ab666d..12507d76fe7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto in = ctx.Input("X"); auto out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index ef59772b173..718f60c7737 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index c59d8315a36..de15395eb4d 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto stream = static_cast(dev_ctx)->stream(); + auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index ceac881bff1..4f21dc2992a 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index e9228a28dba..8356bbb65a8 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - 
stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index e679fb2fe9c..718c77aaa6f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 33e320816de..e43c67d7bf3 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index 3fb2047dc27..74bdd2b63ae 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { T* recv_buff = temp_out.data(); gpuStream_t stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclAllGather(send_buff, @@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel { offset += rows_per_tensor; } - math::ConcatFunctor functor; + math::ConcatFunctor functor; out->mutable_data(out_dims, place); - auto& dev_ctx2 = ctx.template device_context(); + auto& dev_ctx2 = ctx.template device_context(); functor(dev_ctx2, inputs, axis, out); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 3f14c0ac9c1..53aef8e8357 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel { auto *ids_t = context.Input("Ids"); auto *output_t = context.Output("Out"); - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; @@ -142,8 +141,7 @@ template class CEmbeddingGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); auto ids_t = context.Input("Ids"); auto d_output_t = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h 
b/paddle/fluid/operators/collective/c_reduce_op.h index f2e6cdbe2ca..dae4fa497f7 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 33617d8787d..354c31c213b 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index b7e6262b81e..42d9ed2342c 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index de83bc773ba..ef7e298aaf6 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor { const auto& place = ctx.GetPlace(); const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); // use global calculate stream - const auto stream = static_cast( + const auto stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* logits_max_buff = logits_max.mutable_data(place); auto eigen_logits_max = math::EigenMatrix::From(logits_max); @@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = 
math::EigenMatrix::From(sum_exp_logits); @@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const int rank = ctx.Attr("rank"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto map = distributed::ProcessGroupMapFromGid::getInstance(); distributed::ProcessGroup* pg = map->get(rid); @@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); auto eigen_logits_max = math::EigenMatrix::From(logits_max); Eigen::DSizes along_axis(1); @@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); @@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); const int rank = context.Attr("rank"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); if (logit_grad != softmax) { framework::TensorCopy( diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 06c251e32cf..5b34e4ba9d5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -83,7 +83,7 @@ class CSplitOpCUDAKernel : public framework::OpKernel { rank, nranks)); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto dims = x->dims(); auto dims_size = dims.size(); // final dim diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index f800be642f7..5b26e47a8fd 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( + auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::GpuStreamSync(dev_ctx->stream()); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index 4a60f255b47..bacbe014a34 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc 
b/paddle/fluid/operators/collective/c_wait_compute_op.cc index cec57c6bfd7..34569b0a4b6 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 2b8ba4049c5..3d7ab09f45e 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -83,7 +83,7 @@ struct GlobalGatherFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index b8b260c74ce..1337901f185 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -82,7 +82,7 @@ struct GlobalScatterFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 7e25f6876ad..6bc18254737 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -81,7 +81,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index da6690a96a1..526f9425992 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -82,7 +82,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 874bd61d198..84b1e7148df 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -78,7 +78,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = 
static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 944644f4101..ec18a172e1f 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -158,7 +158,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 063eb5c1f82..37b18703031 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -153,7 +153,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 2e65e9f352d..89b703d8d1a 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -124,8 +124,7 @@ __global__ void ConvShiftDy(const T *x, } // namespace template -class ConvShiftKernel - : public framework::OpKernel { +class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -146,8 +145,7 @@ class ConvShiftKernel dim3 grid_dim(num_x_blocks, batch_size); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -155,8 +153,7 @@ class ConvShiftKernel }; template -class ConvShiftGradKernel - : public framework::OpKernel { +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -174,9 +171,8 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = - context.template device_context(); - phi::funcs::SetConstant zero; + auto &device_ctx = context.template device_context(); + phi::funcs::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -212,9 +208,7 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_CUDA_KERNEL( - conv_shift_grad, - ops::ConvShiftGradKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 746debe21e5..3205d5b3538 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -44,7 +44,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( auto data_type = 
OperatorWithKernel::IndicateVarDataType(ctx, "Input"); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; if (use_cudnn) { library_ = framework::LibraryType::kCUDNN; @@ -348,7 +348,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif @@ -435,7 +435,7 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index e753b148fe1..3172625681a 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -132,7 +132,7 @@ void Compare2(f::Scope* scope, #ifdef PADDLE_WITH_CUDA TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); @@ -142,7 +142,7 @@ TEST(copy_cross_scope, CUDA_fp32) { TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index d632de3ac86..434506c033c 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -176,7 +176,7 @@ __global__ void correlation_forward(T *output, } } -// class CorrelationKernel +// class CorrelationKernel template class CorrelationCUDAKernel : public framework::OpKernel { public: @@ -197,7 +197,7 @@ class CorrelationCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // base on input1, NCHW auto in_dims = input1->dims(); @@ -209,11 +209,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); @@ -468,7 +468,7 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { grad_input1->mutable_data(ctx.GetPlace()); auto *grad_input2 = 
ctx.Output(framework::GradVarName("Input2")); grad_input2->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto in_dims = input1->dims(); int N = in_dims[0]; @@ -479,11 +479,11 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu index 3d144ca29d9..5599a9b19b0 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cos_sim, ops::CosSimKernel); -REGISTER_OP_CUDA_KERNEL( - cos_sim_grad, - ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index bdc1f61fbe0..41e9d673d3f 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -230,11 +230,9 @@ REGISTER_OP_CPU_KERNEL(crop_grad, ops::CropGradKernel, ops::CropGradKernel); -REGISTER_OP_CUDA_KERNEL( - crop, - ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, - ops::CropGradKernel, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, + ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CUDA_KERNEL(crop_grad, + ops::CropGradKernel, + ops::CropGradKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index cabe21919a9..2557532a940 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
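The copy_cross_scope test hunk above (and similar test files later in this patch, such as dlnne_engine_op_test.cc and feed_forward_test.cu) constructs the GPU context directly and wires in an allocator by hand. Below is a short sketch of that construction pattern, assuming the AllocatorFacade calls spelled out in the hunk; any additional initialization the tests perform after this point is not reproduced here.

// Sketch: direct construction of the unified GPU context in a test, with the
// allocator supplied explicitly, mirroring the copy_cross_scope hunk above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

void MakeTestGpuContext() {
  paddle::platform::CUDAPlace place(0);
  phi::GPUContext ctx(place);
  // The context no longer carries a default allocator; the test hands it one
  // from the global facade, keyed on this place and the context's own stream.
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
}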
*/ namespace plat = paddle::platform; namespace ops = paddle::operators; -using CUDACtx = paddle::platform::CUDADeviceContext; +using CUDACtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, ops::CrossEntropyOpKernel, diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 25d2c4e77d1..2095b3d3858 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -154,10 +154,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { if (host_out_lod0.back() == 0) { output->Resize({1, 1}); output->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), - output, - -1); + phi::funcs::SetConstant set_constant; + set_constant( + ctx.template device_context(), output, -1); } } } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index bf3009e1fe2..d53333d2176 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -247,7 +247,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { SequenceLength = operators::GetDataFromTensor(sequence_length); } - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); int seq_length = x->dims()[0]; @@ -262,9 +262,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int weight_numel; bool w_initialized = false; auto place = ctx.GetPlace(); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); if (is_test && ctx.HasInput("W")) { auto *W = ctx.Input("W"); w_initialized = W->IsInitialized() ? true : false; @@ -460,7 +460,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto weight_grad_list = ctx.MultiOutput( framework::GradVarName("WeightList")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto input_dims = input->dims(); @@ -479,9 +479,9 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { bool continuous = is_continuous>(weight_list); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); Tensor weight_whole; T *weight_data = nullptr; @@ -494,7 +494,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } Tensor weight_grad; - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index a5c3b51d300..d08d9e14ef0 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -99,8 +99,7 @@ class CVMCUDAKernel : public framework::OpKernel { T* y_data = y->mutable_data(context.GetPlace()); // for Input X do not have Lod Information. - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (x->NumLevels() == 0) { CvmComputeKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -147,8 +146,7 @@ class CVMGradCUDAKernel : public framework::OpKernel { auto item_size = dx_numel / batch_size; // for Input X do not have Lod Information. 
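The ctc_align, cudnn_lstm, and cvm hunks above fetch the CUDA stream either through the typed device_context accessor of the execution context or by casting the untyped DeviceContext reference; the template and cast arguments were stripped in extraction. Below is a hedged sketch of both spellings against the unified context, using a placeholder kernel name.

// Sketch: the two stream-access spellings used by the migrated kernels.
// ExampleStreamKernel is a placeholder, not an operator in this patch.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename T>
class ExampleStreamKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // 1) Typed lookup through the execution context (cvm/ctc_align style).
    auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto stream = dev_ctx.stream();

    // 2) Cast of the untyped DeviceContext (cudnn_lstm style).
    auto stream2 =
        reinterpret_cast<const phi::GPUContext&>(ctx.device_context())
            .stream();
    (void)stream;
    (void)stream2;
  }
};

}  // namespace operators
}  // namespace paddle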
- auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (dx->NumLevels() == 0) { CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index f87c88b2aaf..e3f510e755b 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -104,8 +104,7 @@ __global__ void KernelUpdateParam(int C, } template -class DataNormKernel - : public framework::OpKernel { +class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -130,8 +129,7 @@ class DataNormKernel T *scale_out_data = ctx.Output("Scales")->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); KernelMeanScale<<>>( C, @@ -146,8 +144,7 @@ class DataNormKernel }; template -class DataNormGradKernel - : public framework::OpKernel { +class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -180,8 +177,7 @@ class DataNormGradKernel ctx.Output(framework::GradVarName("BatchSquareSum")) ->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); if (d_x != nullptr) { KernelDataNormBP<< } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - data_norm, - ops::DataNormKernel, - ops::DataNormKernel); -REGISTER_OP_CUDA_KERNEL( - data_norm_grad, - ops::DataNormGradKernel, - ops::DataNormGradKernel); +REGISTER_OP_CUDA_KERNEL(data_norm, + ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CUDA_KERNEL(data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 18a38a0471d..d974a60197d 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -624,7 +624,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling, ops::DeformablePSROIPoolCUDAKernel, ops::DeformablePSROIPoolCUDAKernel); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cu b/paddle/fluid/operators/dequantize_abs_max_op.cu index 964f740a03f..57d2c02adb0 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cu +++ b/paddle/fluid/operators/dequantize_abs_max_op.cu @@ -27,8 +27,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, float max_range, @@ -46,14 +46,14 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; 
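The dequantize_abs_max hunk just above re-keys its per-device functor on phi::GPUContext and instantiates it explicitly. Below is a condensed sketch of that specialization-plus-instantiation pattern; the functor name and the element-wise math are placeholders, not the op's real dequantize formula.

// Sketch of the functor-specialization pattern: the GPU specialization is
// keyed on phi::GPUContext and instantiated explicitly at the bottom.
// ScaleFunctor and KeScale are illustrative names only.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
struct ScaleFunctor;

template <typename T>
__global__ void KeScale(const T* in, T factor, int64_t num, T* out) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num) out[idx] = in[idx] * factor;
}

template <typename T>
struct ScaleFunctor<phi::GPUContext, T> {
  void operator()(const phi::GPUContext& dev_ctx,
                  const framework::Tensor* in,
                  T factor,
                  framework::Tensor* out) {
    const T* in_data = in->data<T>();
    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
    int64_t num = in->numel();
    const int threads = 512;
    const int blocks = static_cast<int>((num + threads - 1) / threads);
    KeScale<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
        in_data, factor, num, out_data);
  }
};

// Explicit instantiations, mirroring the "template struct ...;" lines above.
template struct ScaleFunctor<phi::GPUContext, float>;
template struct ScaleFunctor<phi::GPUContext, double>;

}  // namespace operators
}  // namespace paddle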
REGISTER_OP_CUDA_KERNEL(dequantize_abs_max, ops::DequantizeMaxAbsKernel, ops::DequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index dc4e03a858f..2c47d9b17aa 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -36,8 +36,8 @@ __global__ void KeDequantize(const T* in, } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* dict, framework::Tensor* out) { @@ -54,11 +54,11 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_log, ops::DequantizeLogKernel); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 82678d456c3..4c729a65f59 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -39,8 +39,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy( gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream()); #else @@ -66,8 +65,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index 036a33cff8e..30250eb8cc0 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -96,8 +96,7 @@ class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); anchors->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index e41f4e9b3b7..90be767e2f2 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -47,14 +47,14 @@ struct RangeInitFunctor { }; template -static void SortDescending(const platform::CUDADeviceContext &ctx, +static void SortDescending(const phi::GPUContext &ctx, const Tensor &value, Tensor *value_out, Tensor *index_out) { int num = static_cast(value.numel()); Tensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - platform::ForRange for_range(ctx, num); + platform::ForRange for_range(ctx, num); for_range(RangeInitFunctor{0, 1, idx_in}); int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); @@ -287,7 +287,7 @@ static __global__ void NMSKernel(const int n_boxes, } template -static void NMS(const platform::CUDADeviceContext &ctx, +static void NMS(const phi::GPUContext &ctx, const Tensor &proposals, const Tensor &sorted_indices, const T 
nms_threshold, diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 084faf32e6b..87dc4a30abb 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -75,7 +75,6 @@ class GPUBoxClipKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - box_clip, - ops::GPUBoxClipKernel, - ops::GPUBoxClipKernel); +REGISTER_OP_CUDA_KERNEL(box_clip, + ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 7f66cb86b56..f87a636bdfb 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -152,7 +152,5 @@ class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( box_decoder_and_assign, - ops::BoxDecoderAndAssignCUDAKernel, - ops::BoxDecoderAndAssignCUDAKernel); + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 01346c94fa6..0fbc54d3135 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -138,8 +138,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_total( - dev_ctx, total_roi_num); + platform::ForRange for_range_total(dev_ctx, total_roi_num); for_range_total(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -188,8 +187,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor batch_index_t; int* batch_idx_in = batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_post( - dev_ctx, real_post_num); + platform::ForRange for_range_post(dev_ctx, real_post_num); for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); Tensor out_id_t; @@ -228,7 +226,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &length_lod, static_cast(0)); int blocks = NumBlocks(real_post_num); @@ -274,7 +272,5 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( collect_fpn_proposals, - ops::GPUCollectFpnProposalsOpKernel, - ops::GPUCollectFpnProposalsOpKernel); + ops::GPUCollectFpnProposalsOpKernel, + ops::GPUCollectFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 8521b28127b..aa60d054546 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -164,8 +164,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); 
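The bbox_util and collect_fpn_proposals hunks above now parameterize utility templates such as platform::ForRange and phi::funcs::SetConstant on the unified context; the template arguments themselves were again lost in extraction. Below is a small self-contained sketch of how those helpers are typically invoked after the change (RangeInitFunctor is redeclared locally only so the snippet stands alone).

// Sketch: utility templates keyed on phi::GPUContext, as in the detection-op
// hunks above. Compile as CUDA (.cu) code.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace {

struct RangeInitFunctor {
  int start_;
  int delta_;
  int* out_;
  __device__ void operator()(size_t i) {
    out_[i] = start_ + static_cast<int>(i) * delta_;
  }
};

void InitIndexAndZeroLod(const phi::GPUContext& dev_ctx,
                         int num,
                         int* idx_in,
                         paddle::framework::Tensor* length_lod) {
  // Fill idx_in with 0, 1, 2, ... on the GPU.
  paddle::platform::ForRange<phi::GPUContext> for_range(dev_ctx, num);
  for_range(RangeInitFunctor{0, 1, idx_in});

  // Zero a tensor through the generic SetConstant functor.
  phi::funcs::SetConstant<phi::GPUContext, int> set_zero;
  set_zero(dev_ctx, length_lod, static_cast<int>(0));
}

}  // namespace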
GenDensityPriorBox<<>>(feature_height, feature_width, img_height, diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 3fd309aee40..1063382ef33 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -129,7 +129,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor sub_lod_list; sub_lod_list.Resize({num_level, lod_size}); int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &sub_lod_list, static_cast(0)); Tensor target_lvls; @@ -155,7 +155,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range(dev_ctx, roi_num); + platform::ForRange for_range(dev_ctx, roi_num); for_range(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -258,7 +258,5 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distribute_fpn_proposals, - ops::GPUDistributeFpnProposalsOpKernel, - ops::GPUDistributeFpnProposalsOpKernel); + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 00ffeebc08b..ed1ad6da34d 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_info, const Tensor &anchors, const Tensor &variances, @@ -59,7 +59,7 @@ static std::pair ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -94,7 +94,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -266,5 +266,4 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( generate_proposals, - ops::CUDAGenerateProposalsKernel); + ops::CUDAGenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index d5005f435f2..682a9adf659 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_shape, const Tensor &anchors, const Tensor &variances, @@ -60,7 +60,7 @@ static std::pair 
ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -98,7 +98,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -274,5 +274,4 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( generate_proposals_v2, - ops::CUDAGenerateProposalsV2Kernel); + ops::CUDAGenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu index 8342b4138c8..dc27f326538 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cu +++ b/paddle/fluid/operators/detection/iou_similarity_op.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/iou_similarity_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - iou_similarity, - ops::IOUSimilarityKernel, - ops::IOUSimilarityKernel); +REGISTER_OP_CUDA_KERNEL(iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu index 18088067147..1cdf7691338 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -149,8 +149,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 5bf68c154c6..73b28f8f0e4 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -382,7 +382,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { T* out2in_w_data = out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - phi::funcs::SetConstant init; + phi::funcs::SetConstant init; init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); auto transformed_height = ctx.Attr("transformed_height"); @@ -519,7 +519,7 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 9ca480ba727..3def90fd459 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -185,12 +185,9 @@ class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { namespace ops = 
paddle::operators; REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss, - ops::GPUSigmoidFocalLossKernel, - ops::GPUSigmoidFocalLossKernel); + ops::GPUSigmoidFocalLossKernel, + ops::GPUSigmoidFocalLossKernel); REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss_grad, - ops::GPUSigmoidFocalLossGradKernel, - ops::GPUSigmoidFocalLossGradKernel); + ops::GPUSigmoidFocalLossGradKernel, + ops::GPUSigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu index 1ab698998c7..337f55a3ca8 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cu +++ b/paddle/fluid/operators/detection/target_assign_op.cu @@ -41,8 +41,8 @@ __global__ void NegTargetAssignKernel(const int* neg_indices, } template -struct NegTargetAssignFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct NegTargetAssignFunctor { + void operator()(const phi::GPUContext& ctx, const int* neg_indices, const size_t* lod, const int N, @@ -58,16 +58,13 @@ struct NegTargetAssignFunctor { } }; -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - target_assign, - ops::TargetAssignKernel, - ops::TargetAssignKernel); +REGISTER_OP_CUDA_KERNEL(target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu index e7f564b7ab4..9926d0e5436 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cu +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu @@ -15,6 +15,5 @@ limitations under the License. */ #include "paddle/fluid/operators/dgc_clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_clip_by_norm, - ops::DGCClipByNormKernel); +REGISTER_OP_CUDA_KERNEL(dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu index 0f0bf441a70..e8aa9b5245d 100644 --- a/paddle/fluid/operators/dgc_op.cu +++ b/paddle/fluid/operators/dgc_op.cu @@ -16,5 +16,4 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc, ops::DGCOpKernel); +REGISTER_OP_CUDA_KERNEL(dgc, ops::DGCOpKernel); diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu index c40206b0032..c9afc983b03 100644 --- a/paddle/fluid/operators/diag_op.cu +++ b/paddle/fluid/operators/diag_op.cu @@ -16,9 +16,8 @@ limitations under the License. 
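Most registration hunks in this stretch (sigmoid_focal_loss, target_assign, dgc_clip_by_norm, dgc, diag) shrink to a flatter REGISTER_OP_CUDA_KERNEL call because only one GPU device-context type remains, but the concrete kernel template arguments are missing from the extracted text. Below is a hedged sketch of the post-change registration shape; the op name, kernel class, and the float/double pair are placeholders standing in for whatever each real hunk registers.

// Sketch: general shape of a migrated REGISTER_OP_CUDA_KERNEL call.
// example_op and ExampleKernel are placeholders, not part of this patch.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(example_op,
                        ops::ExampleKernel<phi::GPUContext, float>,
                        ops::ExampleKernel<phi::GPUContext, double>);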
*/ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +REGISTER_OP_CUDA_KERNEL(diag, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc index 3fa1b6ef08c..92e5d66776d 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -127,7 +127,7 @@ TEST(DlnneEngineOp, manual) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // Prepare variables. CreateCUDATensor(&scope, "x", std::vector({2, 4})); CreateCUDATensor(&scope, "y", std::vector({4, 6})); @@ -145,7 +145,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 7733d202e57..d51c57d6eab 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -98,7 +98,7 @@ TEST(Dropout, CPUDense) { TEST(Dropout, GPUDense) { f::Scope scope; p::CUDAPlace place; - p::CUDADeviceContext ctx(place); + p::phi::GPUContext ctx(place); Compare(scope, ctx); } */ diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 369fea2b0b1..681f91ffa68 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -85,9 +85,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { auto batch_size = x1_t->dims()[0]; auto normalized = ctx.Attr("normalized"); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()).stream(); framework::Vector hyp_lod(batch_size + 1); framework::Vector ref_lod(batch_size + 1); @@ -124,8 +123,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { } const size_t num_strs = hyp_lod.size() - 1; - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), + phi::funcs::SetConstant set_constant; + set_constant(ctx.template device_context(), sequence_num, static_cast(num_strs)); diff --git a/paddle/fluid/operators/eigvalsh_op.cu b/paddle/fluid/operators/eigvalsh_op.cu index 3ed431f8002..880570d1be0 100644 --- a/paddle/fluid/operators/eigvalsh_op.cu +++ b/paddle/fluid/operators/eigvalsh_op.cu @@ -16,25 +16,23 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigvalsh, - ops::EigvalshKernel, - ops::EigvalshKernel, - ops::EigvalshKernel>, - ops::EigvalshKernel>); +REGISTER_OP_CUDA_KERNEL(eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); REGISTER_OP_CUDA_KERNEL( eigvalsh_grad, - ops::EigvalshGradKernel, - ops:: - EigvalshGradKernel, - ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, - ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index b0b0db5cde4..f81b76aa487 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1596,7 +1596,7 @@ static inline std::vector GetReduceDim(const framework::DDim &in, #if defined(__NVCC__) || defined(__HIPCC__) template -void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXAndYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, @@ -1609,7 +1609,7 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, } template -void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXOrYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 6f1e04ebfa6..3d32c9b8a14 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -158,17 +158,15 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); +REGISTER_OP_CUDA_KERNEL(expand_as, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); #endif diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index d8c66f95a13..1261b777701 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -294,19 +294,17 @@ REGISTER_OP_CPU_KERNEL(expand_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CUDA_KERNEL( expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); #endif diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 5562baca97f..34855fbc96e 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ 
b/paddle/fluid/operators/fake_dequantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 65dfad185c1..161b87ea392 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -31,8 +31,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -102,8 +102,8 @@ __global__ void DequantizeTwoScale(const T* in, } template -struct ChannelDequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, @@ -163,10 +163,10 @@ struct ChannelDequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; -template struct ChannelDequantizeFunctor; -template struct ChannelDequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index c7ad664b7da..a19369fc6f2 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
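A handful of files, such as cross_entropy_op.cu and deformable_psroi_pooling_op.cu earlier, fake_dequantize_op.cu above, and the fake_quantize_op.cu hunk that continues below, keep their registration lists untouched by swapping only a file-local alias. Below is a sketch of that alias approach; the registration shown in the comment is an assumption about the surrounding file, since its template arguments were stripped in extraction.

// Sketch: only the alias target changes, so existing registrations that are
// written in terms of CUDA keep compiling unchanged.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace ops = paddle::operators;
// Before: using CUDA = paddle::platform::CUDADeviceContext;
using CUDA = phi::GPUContext;
using float16 = paddle::platform::float16;

// Existing lines of the form (template arguments assumed, not quoted)
//   REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
//                           ops::FakeQuantizeAbsMaxKernel<CUDA, float>,
//                           ops::FakeQuantizeAbsMaxKernel<CUDA, float16>);
// are untouched because they only reference the alias.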
*/ #include "paddle/fluid/operators/fake_quantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 3b1877f2bc8..22ba8254cdc 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -72,8 +72,8 @@ __global__ void FindAbsMaxKernel(const T *in, const int n, T *out) { } template -struct FindAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const T *in, const int num, T *out) { @@ -90,9 +90,8 @@ struct FindAbsMaxFunctor { } }; -template struct FindAbsMaxFunctor; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template __global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in, @@ -164,8 +163,8 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1( } template -struct FindChannelAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindChannelAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_tensor, const int quant_axis, T *out_abs_max) { @@ -215,7 +214,7 @@ struct FindChannelAbsMaxFunctor { } }; -template struct FindChannelAbsMaxFunctor; +template struct FindChannelAbsMaxFunctor; template __global__ void ClipAndQuantKernel(const T *in, @@ -289,8 +288,8 @@ __global__ void ClipAndQuantDequantKernel(const T *in, } template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -309,11 +308,11 @@ struct ClipAndFakeQuantFunctor { } }; -template struct ClipAndFakeQuantFunctor; +template struct ClipAndFakeQuantFunctor; template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -408,8 +407,8 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in, } template -struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -462,8 +461,7 @@ struct ChannelClipAndFakeQuantFunctor { } }; -template struct ChannelClipAndFakeQuantFunctor; +template struct ChannelClipAndFakeQuantFunctor; template __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, @@ -491,8 +489,8 @@ __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, } template -struct FindRangeAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindRangeAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &cur_scale, const framework::Tensor &last_scale, const framework::Tensor &iter, @@ -535,7 +533,7 @@ struct FindRangeAbsMaxFunctor { sizeof(int), ctx.stream()); ctx.Wait(); - FindAbsMaxFunctor()( + FindAbsMaxFunctor()( ctx, scale_arr, len, out_scale_data); } } @@ -556,11 
+554,11 @@ __global__ void FindMovingAverageAbsMaxKernel(const T *in_state, *out_scale = accum / state; } -template struct FindRangeAbsMaxFunctor; +template struct FindRangeAbsMaxFunctor; template -struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindMovingAverageAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_accum, const framework::Tensor &in_state, const T *cur_scale, @@ -660,8 +658,8 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, } template -struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -712,8 +710,7 @@ struct ChannelClipFakeQuantDequantFunctor { } }; -template struct ChannelClipFakeQuantDequantFunctor; +template struct ChannelClipFakeQuantDequantFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cu.cc b/paddle/fluid/operators/fc_op.cu.cc index 4147903551d..35c55135f21 100644 --- a/paddle/fluid/operators/fc_op.cu.cc +++ b/paddle/fluid/operators/fc_op.cu.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fc, - ops::FCOpKernel, - ops::FCOpKernel, - ops::FCOpKernel); +REGISTER_OP_CUDA_KERNEL(fc, + ops::FCOpKernel, + ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 93fb678e211..43776e98a02 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -42,7 +42,7 @@ void GetLinearOp(const std::vector &x, const std::vector &y, const framework::DDim &x_dim, const framework::DDim &y_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -87,7 +87,7 @@ void GetElementwiseAddOp(const std::vector &x, const std::vector &y, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *out) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -128,7 +128,7 @@ void GetLinearOpGrad(const std::vector &x_vec, const framework::DDim &x_dim, const framework::DDim &y_dim, const framework::DDim &out_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -218,7 +218,7 @@ template void GetElementwiseAddOpGrad(const std::vector &dout_vec, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *dy_vec) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -308,7 +308,7 @@ class TestFeedForward { bsz_seq_ = batch_size_ * seq_len_; output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; - ctx_ = new platform::CUDADeviceContext(place_); + ctx_ = new phi::GPUContext(place_); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place_, ctx_->stream()) .get()); @@ -559,7 +559,7 @@ class TestFeedForward { std::vector base_dinput_vec_, base_dweight_vec_, base_dbias_vec_; platform::CUDAPlace place_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; // test for fp32, fp16, fp32+bias and fp16+bias diff --git 
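The feed_forward_test.cu hunk shows the other recurring change in tests: instead of constructing a platform::CUDADeviceContext, the fixture now constructs a phi::GPUContext and attaches an allocator from the global facade. A minimal sketch of that setup, with the rest of the fixture omitted:

platform::CUDAPlace place(0);
auto* ctx = new phi::GPUContext(place);
// Bind the allocator for this place and stream explicitly, as the test does,
// since a freshly constructed context does not pick one up on its own.
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetAllocator(place, ctx->stream())
                      .get());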
a/paddle/fluid/operators/fill_any_op.cu.cc b/paddle/fluid/operators/fill_any_op.cu.cc index ca1726508c4..2a561e6d350 100644 --- a/paddle/fluid/operators/fill_any_op.cu.cc +++ b/paddle/fluid/operators/fill_any_op.cu.cc @@ -17,20 +17,18 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel); REGISTER_OP_CUDA_KERNEL( fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 8e51c203d41..bd8303fe402 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -133,9 +133,9 @@ class FillConstantKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); #else diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 8e30e0833d6..105b207636c 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu index a7c26caa8fb..1b6ab71386b 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_diagonal_tensor_kernel(int64_t size, @@ -109,7 +108,7 @@ class FillDiagonalTensorCUDAKernel : public framework::OpKernel { auto size = out->numel(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = @@ -175,8 +174,7 @@ class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { auto size = dx->numel(); - auto &dev_ctx = - ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 91809b8cd11..fad1bba49f3 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -21,28 +21,24 @@ limitations under the License. 
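The fill_constant_op.h hunk keeps using the generic SetConstant helper; after the unification its device-context parameter is phi::GPUContext and the pooled DeviceContext reference is cast to that type. The collapsed template and cast arguments are reconstructed below as an assumption:

phi::funcs::SetConstant<phi::GPUContext, T> functor;
auto& dev_ctx = *pool.Get(ctx.GetPlace());
// The pool returns a generic DeviceContext&; on a CUDA place it is backed by
// a phi::GPUContext, so the cast narrows it to the type SetConstant expects.
functor(reinterpret_cast<const phi::GPUContext&>(dev_ctx),
        tensor,
        static_cast<T>(value));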
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_zeros_like, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CUDA_KERNEL( fill_zeros_like2, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index e287ce1515a..0a055c688ee 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -17,35 +17,31 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CUDA_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); -REGISTER_OP_CUDA_KERNEL( - flatten2, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel); -REGISTER_OP_CUDA_KERNEL( - flatten2_grad, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel); +REGISTER_OP_CUDA_KERNEL(flatten, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CUDA_KERNEL(flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CUDA_KERNEL(flatten2, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CUDA_KERNEL(flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu index 2d2b334b021..7728d57a276 100644 --- a/paddle/fluid/operators/fold_op.cu +++ b/paddle/fluid/operators/fold_op.cu @@ -16,12 +16,10 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fold, - ops::FoldOpKernel, - ops::FoldOpKernel); +REGISTER_OP_CUDA_KERNEL(fold, + ops::FoldOpKernel, + ops::FoldOpKernel); -REGISTER_OP_CUDA_KERNEL( - fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); +REGISTER_OP_CUDA_KERNEL(fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu index a762054a1ea..d1931367307 100644 --- a/paddle/fluid/operators/fsp_op.cu +++ b/paddle/fluid/operators/fsp_op.cu @@ -18,8 +18,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(fsp, - ops::FSPOpKernel, - ops::FSPOpKernel); + ops::FSPOpKernel, + ops::FSPOpKernel); REGISTER_OP_CUDA_KERNEL(fsp_grad, - ops::FSPGradOpKernel, - ops::FSPGradOpKernel); + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index 3db4992bd29..baed3ca7a1a 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -22,7 +22,7 @@ namespace operators { template class AttnLayerNorm { public: - AttnLayerNorm(const platform::CUDADeviceContext& dev_ctx, + AttnLayerNorm(const phi::GPUContext& dev_ctx, float epsilon, int64_t batch_size, int64_t feature_size) @@ -82,7 +82,7 @@ class AttnLayerNorm { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t feature_size_; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 6dd6cc28139..fa50d5b23bf 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -95,7 +95,7 @@ __global__ void BroadcastKernelBinary( // bias add forward impl for "[m, n] + [n] = [m, n]" template -void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, +void LaunchBiasAddFwKernel(const phi::GPUContext& ctx, int m, int n, const T* in0, @@ -302,7 +302,7 @@ __global__ void BiasAddBw1DReduceKernel(const ReduceParamType* temp_sum, } template -void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, +void Launch2DColumnReduce(const phi::GPUContext& dev_ctx, const int max_threads, const int reduce_num, const int left_num, @@ -345,11 +345,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, // input // and d_bias[n] as output. 
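attention_layer_norm.h is representative of the fused helper classes (FeedForward, AttnMatMul, and FMHARef follow the same shape): the constructor parameter and the stored reference member change type, nothing else. A trimmed sketch of the pattern; the exact member list is an assumption:

template <typename T>
class AttnLayerNorm {
 public:
  AttnLayerNorm(const phi::GPUContext& dev_ctx,
                float epsilon,
                int64_t batch_size,
                int64_t feature_size)
      : dev_ctx_(dev_ctx),
        epsilon_(epsilon),
        batch_size_(batch_size),
        feature_size_(feature_size) {}

 private:
  const phi::GPUContext& dev_ctx_;  // previously platform::CUDADeviceContext&
  float epsilon_;
  int64_t batch_size_;
  int64_t feature_size_;
};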
template -void LaunchBiasAddBwKernel(const platform::CUDADeviceContext& dev_ctx, - int m, - int n, - const T* d_out, - T* d_bias) { +void LaunchBiasAddBwKernel( + const phi::GPUContext& dev_ctx, int m, int n, const T* d_out, T* d_bias) { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int reduce_num = m; int left_num = n; diff --git a/paddle/fluid/operators/fused/attn_feed_forward.h b/paddle/fluid/operators/fused/attn_feed_forward.h index 568c283d3e4..753eb447108 100644 --- a/paddle/fluid/operators/fused/attn_feed_forward.h +++ b/paddle/fluid/operators/fused/attn_feed_forward.h @@ -24,7 +24,7 @@ namespace operators { template class FeedForward { public: - FeedForward(const platform::CUDADeviceContext& dev_ctx, + FeedForward(const phi::GPUContext& dev_ctx, int bsz_seq, int output_size, int input_size, @@ -53,7 +53,7 @@ class FeedForward { // column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out) // here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -78,7 +78,7 @@ class FeedForward { T* input, T* weight, T* d_output, T* d_input, T* d_weight, T* d_bias) { T alpha = static_cast(1.0); T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); // column-major: gemm-nt, get d_weight. CBLAS_TRANSPOSE transA = CblasTrans; @@ -116,7 +116,7 @@ class FeedForward { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int bsz_seq_, output_size_, input_size_; bool compute_bias_; }; diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 9adfe8e088d..07947f522cd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -30,7 +30,7 @@ template class AttnMatMul { public: // (m, n, k) = bsz_seq, output_size, input_size - AttnMatMul(const platform::CUDADeviceContext& dev_ctx, + AttnMatMul(const phi::GPUContext& dev_ctx, bool transA, bool transB, int bsz_seq, @@ -60,7 +60,7 @@ class AttnMatMul { T beta = static_cast(0.0); // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -91,7 +91,7 @@ class AttnMatMul { T beta_dA = use_addto ? 
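attn_feed_forward.h and attn_gemm.h repeat the same one-line substitution at every BLAS call site: the context type handed to phi::funcs::GetBlas becomes phi::GPUContext. A sketch of the reconstructed call; the operand names are illustrative, not the file's:

auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx_);
// (m, n, k) = bsz_seq, output_size, input_size, as in the surrounding code.
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_,
          alpha, input_data, weight_data, beta, output_data);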
static_cast(1.0) : static_cast(0.0); T beta_dB = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { // forward: gemm-nt if (transB_) { @@ -223,7 +223,7 @@ class AttnMatMul { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool transA_; bool transB_; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 121cbc909b8..81e8c573266 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -43,7 +43,7 @@ template class CUDNNConvFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* bias = ctx.Input("Bias"); @@ -109,17 +109,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); + ctx.AllocateTmpTensor(new_input_shape, dev_ctx); const int rank = transformed_input_channel.dims().size(); T pad_value(0.0); switch (rank) { case 4: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, @@ -127,7 +125,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { &transformed_input); } break; case 5: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 0bda60f6b8b..e11792a5dfb 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -149,7 +149,7 @@ void ComputeInplaceRelu(framework::Tensor *cpu_x) { } } -void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, +void ComputeBatchNormForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, @@ -216,7 +216,7 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_z, const Tensor &cpu_scale, @@ -280,7 +280,7 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluBackward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluBackward(const phi::GPUContext &ctx, const Tensor &cpu_dy, const Tensor &cpu_x, const Tensor &cpu_scale, @@ -384,10 +384,8 @@ class CudnnBNAddReluTester { << ", is_relative_atol=" << is_relative_atol << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ << ", has_shortcut=" << has_shortcut_; - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); auto select = [&](Tensor 
*in) { return has_shortcut_ ? in : nullptr; }; @@ -469,10 +467,8 @@ class CudnnBNAddReluTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_dx_base; framework::Tensor cpu_dz_base; @@ -526,7 +522,7 @@ class CudnnBNAddReluTester { {channels_}, static_cast(0.0f), cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -573,7 +569,7 @@ class CudnnBNAddReluTester { } } - void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineForwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -594,7 +590,7 @@ class CudnnBNAddReluTester { saved_reserve_space); } - void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineBackwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, @@ -614,7 +610,7 @@ class CudnnBNAddReluTester { cpu_dbias); } - void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx, + void ComputeFusedBNStatsFinalize(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_bn_scale, const Tensor &cpu_bn_bias, @@ -671,7 +667,7 @@ class CudnnBNAddReluTester { } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -809,7 +805,7 @@ class CudnnBNAddReluTester { } // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 719c2fe64e5..628642b9563 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -61,7 +61,7 @@ struct BNStatsFinalizeArgs { template class CudnnBNStatsFinalize { public: - CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + CudnnBNStatsFinalize(const phi::GPUContext &ctx, const std::vector ¶m_shape) : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { @@ -69,7 +69,7 @@ class CudnnBNStatsFinalize { } ~CudnnBNStatsFinalize() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &sum, const Tensor &sum_of_squares, const Tensor &scale, @@ -130,7 +130,7 @@ class CudnnBNStatsFinalize { } private: - void TrainInit(const platform::CUDADeviceContext &ctx) { + void TrainInit(const phi::GPUContext &ctx) { // Set constant_param for train op train_op_.SetOpConstParamAttr({CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, @@ -167,7 +167,7 @@ class CudnnBNStatsFinalize { &workspace_size_bytes); } - void InferenceInit(const platform::CUDADeviceContext &ctx) { + void InferenceInit(const phi::GPUContext &ctx) { // Set constant_param for inference op 
inference_op_.SetOpConstParamAttr({CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 3c924ddd9d9..34cf677223c 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -37,7 +37,7 @@ struct NormConvolutionArgs { compute_type = platform::CudnnDataType::type; } - void Set(const platform::CUDADeviceContext &ctx, + void Set(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -124,7 +124,7 @@ struct NormConvolutionArgs { conv_desc.set(dtype, paddings, strides, dilations, false, group); } - bool IsSupport(const platform::CUDADeviceContext &ctx, + bool IsSupport(const phi::GPUContext &ctx, const std::vector &filter_shape, int stride, int dilation, @@ -167,7 +167,7 @@ struct NormConvolutionArgs { template class CudnnNormConvolution { public: - CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + CudnnNormConvolution(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -186,7 +186,7 @@ class CudnnNormConvolution { } ~CudnnNormConvolution() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, Tensor *output, @@ -228,7 +228,7 @@ class CudnnNormConvolution { } private: - CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetForwardOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetForward()); @@ -284,7 +284,7 @@ class CudnnNormConvolution { template class CudnnNormConvolutionGrad { public: - CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + CudnnNormConvolutionGrad(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -304,7 +304,7 @@ class CudnnNormConvolutionGrad { } ~CudnnNormConvolutionGrad() {} - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, const Tensor &output_grad, @@ -327,7 +327,7 @@ class CudnnNormConvolutionGrad { } private: - void BackwardFilter(const platform::CUDADeviceContext &ctx, + void BackwardFilter(const phi::GPUContext &ctx, T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { @@ -355,7 +355,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - void BackwardData(const platform::CUDADeviceContext &ctx, + void BackwardData(const phi::GPUContext &ctx, T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, @@ -387,7 +387,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetBackwardFilterOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetBackward()); @@ -430,7 +430,7 @@ class CudnnNormConvolutionGrad { return wgrad_op; } - size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t GetWorkspaceSizeBwdData(const phi::GPUContext &ctx) { size_t workspace_size = 0U; auto handle = ctx.cudnn_handle(); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 7d404e6b3ed..ef93612ffce 100644 --- 
a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -94,7 +94,7 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DForward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, Tensor *cpu_output, @@ -130,7 +130,7 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DBackward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -242,10 +242,8 @@ class CudnnNormConvolutionTester { ~CudnnNormConvolutionTester() {} void CheckForward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; @@ -266,10 +264,8 @@ class CudnnNormConvolutionTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_input_grad_base; framework::Tensor cpu_filter_nchw_grad_base; @@ -304,7 +300,7 @@ class CudnnNormConvolutionTester { &cpu_output_grad_); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { @@ -314,7 +310,7 @@ class CudnnNormConvolutionTester { *cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } - void BaselineBackward(const platform::CUDADeviceContext &ctx, + void BaselineBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { ComputeConv2DBackward(ctx, @@ -329,7 +325,7 @@ class CudnnNormConvolutionTester { } // get forward results of cudnn_norm_conv - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { @@ -367,7 +363,7 @@ class CudnnNormConvolutionTester { sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad, framework::Tensor *cpu_filter_grad) { framework::Tensor input; @@ -443,7 +439,7 @@ TEST(CudnnNormConvFp16, K1S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -473,7 +469,7 @@ TEST(CudnnNormConvFp16, K3S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -503,7 +499,7 @@ 
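The cuDNN test drivers (cudnn_bn_add_relu_test.cc, cudnn_norm_conv_test.cc) pull the default device context out of the pool and now cast it straight to phi::GPUContext; the cast target is the only change. Reconstructed pattern, with the collapsed cast argument filled in as an assumption:

phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
    platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
if (ctx->GetComputeCapability() < 70) {
  // These fused-op tests require at least SM 7.0, as in the original checks.
  return;
}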
TEST(CudnnNormConvFp16, K1S1O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -533,7 +529,7 @@ TEST(CudnnNormConvFp16, K1S2O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() <= 70) { diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 61e513e911a..b25605c6ca0 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -100,7 +100,7 @@ struct ScaleBiasAddReluArgs { template class CudnnScaleBiasAddRelu { public: - CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + CudnnScaleBiasAddRelu(const phi::GPUContext &ctx, const std::string &act_type, bool fuse_add, bool has_shortcut, @@ -116,7 +116,7 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &x, const Tensor &x_scale, const Tensor &x_bias, @@ -171,7 +171,7 @@ class CudnnScaleBiasAddRelu { fwd_workspace_byte_); } - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &dy, const Tensor &x, const Tensor &scale, @@ -237,7 +237,7 @@ class CudnnScaleBiasAddRelu { } private: - void ForwardInit(const platform::CUDADeviceContext &ctx) { + void ForwardInit(const phi::GPUContext &ctx) { // Set constant_param fwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, @@ -285,7 +285,7 @@ class CudnnScaleBiasAddRelu { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); } - void BackwardInit(const platform::CUDADeviceContext &ctx) { + void BackwardInit(const phi::GPUContext &ctx) { // Set constant_param bwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index ef1befbb320..7de59dd9ee2 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -67,7 +67,7 @@ class AttnDropoutParam { template class FMHARef { public: - FMHARef(const platform::CUDADeviceContext& dev_ctx, + FMHARef(const phi::GPUContext& dev_ctx, int64_t batch_size, int64_t seq_len, int64_t num_head, @@ -146,7 +146,7 @@ class FMHARef { // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transB = CblasTrans; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; int gemm_n = out_seq_len; @@ -274,7 +274,7 @@ class FMHARef { Tensor* transpose_2_out_grad_tensor, Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) { - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; int softmax_axis = -1; @@ -479,7 +479,7 @@ class FMHARef { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t seq_len_; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu 
b/paddle/fluid/operators/fused/fused_attention_op.cu index 2c3fd75d8e0..ed904df93df 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -43,7 +43,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index c94aae0dd49..53984707d50 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -37,7 +37,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormActKernel +class FusedBatchNormActKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -88,7 +88,7 @@ class FusedBatchNormActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, // skip the batch norm calculation, let y = act(x). @@ -217,7 +217,7 @@ class FusedBatchNormActKernel }; template -class FusedBatchNormActGradKernel +class FusedBatchNormActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -268,7 +268,7 @@ class FusedBatchNormActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { if (act_type == "relu") { auto x_v = framework::EigenVector::Flatten(*x); @@ -281,9 +281,7 @@ class FusedBatchNormActGradKernel PADDLE_THROW( platform::errors::Unimplemented("Unsupported activation type")); } - phi::funcs::SetConstant> - functor; + phi::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); return; @@ -402,12 +400,12 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel); + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel); REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act_grad, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel); + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index e703ce810cd..23dbbe2ad08 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -36,7 +36,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormAddActKernel +class FusedBatchNormAddActKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext &ctx) const override { @@ -81,7 +81,7 @@ class FusedBatchNormAddActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); @@ -194,7 +194,7 @@ class FusedBatchNormAddActKernel }; template -class FusedBatchNormAddActGradKernel +class FusedBatchNormAddActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -243,7 +243,7 @@ class FusedBatchNormAddActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; @@ -353,9 +353,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation, - ops::FusedBatchNormAddActKernel); + ops::FusedBatchNormAddActKernel); REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation_grad, - ops::FusedBatchNormAddActGradKernel); + ops::FusedBatchNormAddActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 5016cb65fb7..732da5fa52a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -125,7 +125,7 @@ void LaunchDropoutActBias(Functor act_functor, const T *bias, T *dst, MaskType *mask_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { SetZero(ctx, dst, rows * cols); @@ -277,7 +277,7 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0); auto factor = dropout_prob == static_cast(1.0f) ? 
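Inside the fused batch-norm kernels only the execution-context accessor changes; the cuDNN handle and CUDA stream are then taken from the same object as before. A sketch with the collapsed template argument reconstructed as an assumption and the kernel body elided:

void Compute(const framework::ExecutionContext& ctx) const override {
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto handle = dev_ctx.cudnn_handle();  // cuDNN handle, same accessor as before
  auto stream = dev_ctx.stream();        // CUDA stream, same accessor as before
  // ... remainder of the kernel body is unchanged by this patch ...
}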
zero diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 18f51b5d02b..06810c18cc0 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -57,7 +57,7 @@ struct TestFusedDropoutActBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedDropoutActBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedDropoutActBias(int rows_, @@ -87,7 +87,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedDropoutActBias() {} diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index faac7691ae2..0f37d242ebc 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -40,7 +40,7 @@ namespace operators { * 2D grids: gridDim.y = rows */ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { @@ -101,9 +101,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, } template -inline void SetZero(const platform::CUDADeviceContext &ctx, - T *ptr, - const size_t size) { +inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index aa4c6622f70..208b2a58bca 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -100,8 +100,7 @@ struct DropoutParam { seed_val = context.Attr(pre_fix + "seed"); } - int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, - const int offset) { + int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) { uint64_t tmp_increment; GetSeedDataAndIncrement( ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment); @@ -113,7 +112,7 @@ struct DropoutParam { template class FusedDropoutHelper { private: - int GetIncrement(const platform::CUDADeviceContext& ctx) { + int GetIncrement(const phi::GPUContext& ctx) { const int VecSize = MAX_CACHE_BYTES / sizeof(T); const int real_vec_size = cols_ % VecSize == 0 ? 
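fused_dropout_common.h is typical of the free-function helpers: the context parameter type changes and the body keeps calling ctx.stream(). The collapsed template header is reconstructed below as an assumption:

template <typename T>
inline void SetZero(const phi::GPUContext& ctx, T* ptr, const size_t size) {
  // Asynchronously zero the buffer on the context's CUDA stream.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream()));
}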
VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, @@ -130,7 +129,7 @@ class FusedDropoutHelper { public: FusedDropoutHelper() {} - FusedDropoutHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param) { @@ -140,7 +139,7 @@ class FusedDropoutHelper { } // out = residual + dropout( src + bias ) - void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -162,7 +161,7 @@ class FusedDropoutHelper { ctx); } - void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const MaskType* mask, T* d_src, @@ -189,7 +188,7 @@ class FusedDropoutHelper { } // out = dropout(activation(src + bias)) - void DropoutActBias(const platform::CUDADeviceContext& ctx, + void DropoutActBias(const phi::GPUContext& ctx, const T* src, const T* bias, const std::string& act_method, @@ -234,7 +233,7 @@ class FusedDropoutHelper { } } - void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, + void DropoutActBiasGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const T* bias, @@ -297,7 +296,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { epsilon_ = epsilon; } - FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutLayerNormHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param, @@ -308,7 +307,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } // call layer_norm - void LayerNorm(const platform::CUDADeviceContext& ctx, + void LayerNorm(const phi::GPUContext& ctx, const T* src, const LayerNormParamType* gamma, const LayerNormParamType* beta, @@ -324,7 +323,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } } - void LayerNormGrad(const platform::CUDADeviceContext& ctx, + void LayerNormGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const LayerNormParamType* gamma, @@ -350,7 +349,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { // out = layernorm(residual + dropout(src + bias)) template , bool is_same_type = false> - void LayernormResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -392,7 +391,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } template , bool is_same_type = false> - void LayernormResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const T* layernorm_src, const MaskType* mask, diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index e3ab187f0d7..8fac3165f1c 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -49,7 +49,7 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, std::vector *out, std::vector *mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, uint64_t seed, float dropout_prob, bool is_upscale_in_train, @@ -97,7 +97,7 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, const std::vector &dout, const std::vector &mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, float 
dropout_prob, bool is_upscale_in_train) { framework::Scope scope; @@ -148,7 +148,7 @@ void LayerNorm(const std::vector> &scale, const float epsilon, const int rows, const int cols, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); paddle::optional scale_opt; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu index 7b44aa82e4a..80b10021c09 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu @@ -17,36 +17,28 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index d102c5e4705..abc9b451d17 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -121,5 +121,4 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_embedding_eltwise_layernorm, - ops::EmbeddingEltWiseLayerNormKernel); + ops::EmbeddingEltWiseLayerNormKernel); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 37aa5cbd14d..3e117c45359 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -397,8 +397,8 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { const T* w_data = w->data(); T* out_data = out->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto& dev_ctx = ctx.template device_context(); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.GEMM(false, false, M, diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 4126f5ad726..60b5ecfdd74 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -36,7 +36,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor& tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext& ctx) { + const 
phi::GPUContext& ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -73,7 +73,7 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT template class FusedFeedForwardKernel : public framework::OpKernel { public: - void MatMul(const platform::CUDADeviceContext& ctx, + void MatMul(const phi::GPUContext& ctx, const framework::Tensor& a, const framework::Tensor& b, framework::Tensor* c) const { @@ -86,7 +86,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); } - void FFN(const platform::CUDADeviceContext& ctx, + void FFN(const phi::GPUContext& ctx, const framework::Tensor& x, const framework::Tensor& linear1_weight, const framework::Tensor* linear1_bias, @@ -309,7 +309,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { template class FusedFeedForwardGradKernel : public framework::OpKernel { public: - void MatMulGrad(const platform::CUDADeviceContext& ctx, + void MatMulGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& a, const framework::Tensor& b, @@ -327,7 +327,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); } - void FFNGrad(const platform::CUDADeviceContext& ctx, + void FFNGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& x, const framework::Tensor& dropout1_mask, @@ -630,14 +630,12 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_feedforward, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel); + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel); REGISTER_OP_CUDA_KERNEL( fused_feedforward_grad, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 45d47908b99..f9d9fad110e 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -44,7 +44,7 @@ inline std::string MemoryDebugString(const Tensor& t) { } template -void AllocWithDebugInfo(const platform::CUDADeviceContext& dev_ctx, +void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, const std::string& info, Tensor* t) { t->mutable_data(dev_ctx.GetPlace()); @@ -59,7 +59,7 @@ struct TernaryAddFunctor { template struct GateAttentionConfig { public: - const platform::CUDADeviceContext& dev_ctx; + const phi::GPUContext& dev_ctx; bool merge_qkv; bool has_gating; @@ -86,7 +86,7 @@ struct GateAttentionConfig { phi::DDim qktv_out_dims; phi::DDim gate_out_dims; - GateAttentionConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -249,7 +249,7 @@ struct GateAttentionConfig { template struct GateAttentionGradConfig : public GateAttentionConfig { public: - GateAttentionGradConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionGradConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -322,7 +322,7 @@ struct 
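fused_attention_op.cu, fused_feedforward_op.cu, and fused_multi_transformer_op.cu share a small static AllReduce helper whose signature is the only thing the patch touches. A sketch of the renamed signature, with the NCCL body elided:

template <typename T>
static void AllReduce(framework::Tensor& tensor,  // NOLINT
                      const int ring_id,
                      const phi::GPUContext& ctx) {
  if (ring_id == -1) return;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  // ... process-group / NCCL all-reduce on ctx.stream(), unchanged by this patch ...
#endif
}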
GateAttentionGradConfig : public GateAttentionConfig { template class FMHAGateRef { public: - FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + FMHAGateRef(const phi::GPUContext& dev_ctx, bool merge_qkv) : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} void ComputeForward(const Tensor* nonbatched_bias, @@ -748,7 +748,7 @@ class FMHAGateRef { int64_t stride_a = m * k; int64_t stride_b = k * n; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, @@ -764,7 +764,7 @@ class FMHAGateRef { stride_b); } - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool merge_qkv_; }; diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 7400246f407..139a365c10e 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -350,7 +350,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); AllocWithDebugInfo(dev_ctx, "fmha_out", fmha_out); if (has_gating) { @@ -441,7 +441,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "query_grad", query_grad); GateAttentionGradConfig config( diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 3ebb9f9e640..219a517315b 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -29,7 +29,7 @@ template class FusedGemmEpilogueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -320,7 +320,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { template static void ComputeImpl(const framework::ExecutionContext& ctx) { using Trait = FusedGEMMGradTrait; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* dout = ctx.Input("DOut"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -677,17 +677,14 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel); + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue_grad, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 
301b62524a5..7bb3498567c 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -688,7 +688,7 @@ void LaunchLayernormResidualDropoutBias( T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = ctx.GetPlace(); @@ -846,7 +846,7 @@ template void LaunchLayernormResidualDropoutGrad( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const uint32_t rows, const uint32_t cols, const float epsilon, diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 4f8ceba177e..d3c6cca95ef 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -54,7 +54,7 @@ struct TestFusedLayernormResidualDropoutBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedLayernormResidualDropoutBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedLayernormResidualDropoutBias { epsilon = 0.00001f; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedLayernormResidualDropoutBias(int _rows, @@ -92,7 +92,7 @@ struct TestFusedLayernormResidualDropoutBias { has_layernorm_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedLayernormResidualDropoutBias() {} diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index a8bebd5012d..a858b31e23c 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -49,7 +49,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -996,7 +996,7 @@ void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, } template -void fmha(const platform::CUDADeviceContext &dev_ctx, +void fmha(const phi::GPUContext &dev_ctx, const Tensor &qkv_tensor, const Tensor &qkv_bias_tensor, const Tensor &src_mask_tensor, @@ -1118,7 +1118,7 @@ __global__ void write_cache_v_kernel(T *cache_v, } template -void write_cache_kv(const platform::CUDADeviceContext &dev_ctx, +void write_cache_kv(const phi::GPUContext &dev_ctx, T *cache_k, T *cache_v, const T *k, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index f9bf4c3c5a3..c1131cae5d8 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -178,7 +178,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const T *bias, MaskType *mask_data, T *dst, 
- const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { if (residual == dst) return; @@ -323,7 +323,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0f); auto factor = dropout_prob == static_cast(1.0f) ? zero diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 2ff0d3dc036..ba0652339e9 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -58,7 +58,7 @@ struct FusedResidualDropoutBiasTester { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; FusedResidualDropoutBiasTester() { rows = 32; @@ -69,7 +69,7 @@ struct FusedResidualDropoutBiasTester { is_test = false; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } FusedResidualDropoutBiasTester(int rows, @@ -86,7 +86,7 @@ struct FusedResidualDropoutBiasTester { is_test(is_test) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } void SetUp() { diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 6aba49ea33f..a6a49b7ac62 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -114,9 +114,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext const float padding_value, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = input_data.size() + output_data.size() + seqpool_output_data.size() + lods.size(); auto temp_ptr = @@ -320,9 +319,8 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, const int embedding_size, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = out_grads_data.size() + in_grads_data.size() + cvm_data.size() + lods.size(); auto temp_ptr = diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 22da713f87d..ce892024d8d 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -39,7 +39,7 @@ template class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto filters = ctx.MultiInput("Filter"); auto bias = ctx.MultiInput("Bias"); diff --git 
a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc index 9a81a50efba..9ce8842a015 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc @@ -18,8 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - fusion_group, - ops::FusionGroupKernel, - ops::FusionGroupKernel, - ops::FusionGroupKernel); +REGISTER_OP_CUDA_KERNEL(fusion_group, + ops::FusionGroupKernel, + ops::FusionGroupKernel, + ops::FusionGroupKernel); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 89058cc3fd9..9a1e58c6320 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -52,7 +52,7 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); T* odata = out->data(); diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 4fd51aec24a..8a6d5b313ad 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -237,7 +237,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { auto *temp_out_data = temp_out_tensor.mutable_data(context.GetPlace()); // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) - auto blas = phi::funcs::GetBlas(device_ctx); + auto blas = phi::funcs::GetBlas(device_ctx); blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); // temp_out_tensor.Resize(temp_out_dims); @@ -285,6 +285,5 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - multihead_matmul, - ops::MultiHeadMatMulV2Kernel); +REGISTER_OP_CUDA_KERNEL(multihead_matmul, + ops::MultiHeadMatMulV2Kernel); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 188c2b21be0..d0a8788e0db 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -90,7 +90,7 @@ class ResNetUnitKernel : public framework::OpKernel { output_channel; auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // 1. Conv Tensor sum_x; @@ -268,7 +268,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { auto bitmask_shape = phi::vectorize(bitmask->dims()); auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index 117228d2864..1f9640dd4ba 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -69,6 +69,5 @@ class SkipLayerNormKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - skip_layernorm, - ops::SkipLayerNormKernel); +REGISTER_OP_CUDA_KERNEL(skip_layernorm, + ops::SkipLayerNormKernel); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu index d20ffa274a8..b82b9a931a1 100644 --- a/paddle/fluid/operators/fused/yolo_box_head_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -72,8 +72,7 @@ class YoloBoxHeadKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto anchors = context.Attr>("anchors"); auto class_num = context.Attr("class_num"); - auto& device_ctx = - context.template device_context(); + auto& device_ctx = context.template device_context(); auto x_dims = x->dims(); const int batch_size = x_dims[0]; const int h = x_dims[2]; diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index 6ebf9b8eb31..c259d0efb49 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -587,9 +587,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_softmax_mask, - ops::SoftmaxMaskFuseKernel, - ops::SoftmaxMaskFuseKernel); + ops::SoftmaxMaskFuseKernel, + ops::SoftmaxMaskFuseKernel); REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_grad, - ops::SoftmaxMaskFuseGradKernel, - ops::SoftmaxMaskFuseGradKernel); + ops::SoftmaxMaskFuseGradKernel, + ops::SoftmaxMaskFuseGradKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 1849108ed66..54db576d317 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -577,12 +577,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle, - ops::SoftmaxMaskFuseUpperTriangleKernel, - ops::SoftmaxMaskFuseUpperTriangleKernel); + ops::SoftmaxMaskFuseUpperTriangleKernel, + ops::SoftmaxMaskFuseUpperTriangleKernel); REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle_grad, - ops::SoftmaxMaskFuseUpperTriangleGradKernel, - ops::SoftmaxMaskFuseUpperTriangleGradKernel); + ops::SoftmaxMaskFuseUpperTriangleGradKernel, + ops::SoftmaxMaskFuseUpperTriangleGradKernel); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index a7b64223be7..fa28481f4c4 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -143,8 +143,7 @@ struct gpu_gather_scatter_functor { int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); GatherScatterGPUKernel <<>>(self_data, dim, @@ -257,8 +256,7 @@ void gpu_scatter_input_grad_kernel(Tensor self, int block = 512; int64_t n = 
inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); ScatterInputGradGPUKernel <<>>(grad_data, diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 1e89091b202..81b53c8b949 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -61,8 +61,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::DefaultCUDAGenerator(device_id); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); if (seed == 0) { // use global Generator seed diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 06720f1db11..fc8f195fb70 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -245,8 +245,7 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(0, k, bs, @@ -305,8 +304,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -319,8 +317,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, thrust::raw_pointer_cast(item_count.data()), num_input, @@ -338,8 +335,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -398,8 +394,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(outputs->data()), outputs->size(), size, @@ -411,8 +406,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs, thrust::raw_pointer_cast(reindex_nodes->data()), @@ -625,8 +619,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>( unique_dst_size, thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), @@ -650,7 +643,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(graph_khop_sampler, diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 48872cb8caa..da9ccdf627f 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -41,7 +41,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); @@ -90,7 +90,7 @@ 
class CUDNNGridSampleGradOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index abf367f70e2..668f69b4c75 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -261,8 +261,7 @@ __global__ void GroupNormForward(const T* x, } template -class GroupNormKernel - : public framework::OpKernel { +class GroupNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -291,8 +290,8 @@ class GroupNormKernel y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); auto* x_data = x->data(); @@ -597,8 +596,7 @@ __global__ void GetXGradientCUDAKernel(int imsize, } template -class GroupNormGradKernel - : public framework::OpKernel { +class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -629,8 +627,8 @@ class GroupNormGradKernel : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor ds, db; ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); @@ -816,11 +814,9 @@ class GroupNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - group_norm, - ops::GroupNormKernel, - ops::GroupNormKernel); -REGISTER_OP_CUDA_KERNEL( - group_norm_grad, - ops::GroupNormGradKernel, - ops::GroupNormGradKernel); +REGISTER_OP_CUDA_KERNEL(group_norm, + ops::GroupNormKernel, + ops::GroupNormKernel); +REGISTER_OP_CUDA_KERNEL(group_norm_grad, + ops::GroupNormGradKernel, + ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 37ba915a24f..f3665da1816 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -131,11 +131,9 @@ class GRUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru, - ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CUDA_KERNEL( - gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL(gru, + ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL(gru_grad, + ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu index 979a20a64ee..adaaf1d09cd 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -14,11 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CUDA_KERNEL( - gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit, + ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 835312851b2..0d1006658a4 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -155,9 +155,7 @@ REGISTER_OP_CPU_KERNEL(hinge_loss, REGISTER_OP_CPU_KERNEL(hinge_loss_grad, ops::HingeLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 503b64c3431..b58f9a55756 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -200,9 +200,7 @@ REGISTER_OP_CPU_KERNEL(im2sequence, REGISTER_OP_CPU_KERNEL(im2sequence_grad, ops::Im2SequenceGradKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index a63cd8b0071..044b8118abb 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -225,16 +225,14 @@ namespace plat = paddle::platform; #ifdef PADDLE_WITH_HIP // MIOPEN do not support double REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel); #else REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel, + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel, + ops::InplaceABNGradKernel); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 45b2a05211e..80534d29b5a 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -1337,8 +1337,8 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1432,8 +1432,8 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - 
auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1581,8 +1581,8 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, dim_grad = {n, in_d, in_h, in_w, c}; } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 88447aa830f..d8e18f58fa9 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -17,44 +17,23 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(isinf, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isinf, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isnan, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isnan, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isfinite, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index c7bf0d538bd..093a33d89b0 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -97,8 +97,6 @@ REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); REGISTER_OP_CPU_KERNEL(l1_norm_grad, ops::L1NormGradKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 01abe645495..d14cc076261 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -61,8 +61,7 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel { auto n_expert = expert_count->numel() / n_worker; const auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); dim3 grid_dim(256); dim3 block_dim(1024); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 240f6b06325..008305bdb93 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -85,7 +85,7 @@ class LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif VLOG(3) << "lite engine run"; @@ -103,7 +103,7 @@ class 
LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif } diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fed71abe166..d631c3c7317 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,7 +72,7 @@ TEST(LiteEngineOp, engine_op) { framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 574b7cbec28..b8892e9c88f 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -58,7 +58,7 @@ void serialize_params(std::string* str, std::ostringstream os; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); #else phi::CPUContext ctx; #endif diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu index 2a42c0daa7f..9405b3564b9 100644 --- a/paddle/fluid/operators/load_combine_op.cu +++ b/paddle/fluid/operators/load_combine_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu index c122978d12c..04c456ac603 100644 --- a/paddle/fluid/operators/load_op.cu +++ b/paddle/fluid/operators/load_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); +REGISTER_OP_CUDA_KERNEL(load, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu index a910ad549f1..25aad4c4afc 100644 --- a/paddle/fluid/operators/lod_reset_op.cu +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -16,15 +16,13 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_CUDA_KERNEL( - lod_reset_grad, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index d4b36f31e62..ab4d95c592f 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor Apply(static_cast(dev_ctx)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(dev_ctx)); + Apply(static_cast(dev_ctx)); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 41740923b42..073077f6586 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,8 +151,7 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index ef9bd7865d6..7b4ed84fc20 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -142,8 +142,7 @@ struct LookupTableV2GradCUDAFunctor { template void apply() { - auto &dev_ctx = - context_.template device_context(); + auto &dev_ctx = context_.template device_context(); bool is_sparse = context_.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index c736dfb48a6..8c95cf1d0c9 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -97,7 +97,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha, data_layout); @@ -108,7 +108,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, @@ -138,8 +138,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, @@ -218,7 +218,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template 
device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormDiff <<>>(img_size, x, @@ -236,7 +236,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, } template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -268,13 +268,11 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lrn, ops::LRNKernel); -REGISTER_OP_CUDA_KERNEL( - lrn_grad, ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL(lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc index 60364ef4486..13a0ded14b4 100644 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CUDA_KERNEL( - lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL(lstm, + ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu index 11c2844ccc3..8614eaf5d49 100644 --- a/paddle/fluid/operators/lstmp_op.cu +++ b/paddle/fluid/operators/lstmp_op.cu @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/lstmp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CUDA_KERNEL( - lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +REGISTER_OP_CUDA_KERNEL(lstmp, + ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL(lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index f063716b200..e9d1a6a136a 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -38,10 +38,8 @@ class LstsqCUDAKernel : public framework::OpKernel { auto* solution = context.Output("Solution"); auto dito = - math::DeviceIndependenceTensorOperations(context); - auto& dev_ctx = - context.template device_context(); + math::DeviceIndependenceTensorOperations(context); + auto& dev_ctx = context.template device_context(); auto x_dims = x.dims(); auto y_dims = y.dims(); @@ -163,20 +161,19 @@ class LstsqCUDAKernel : public framework::OpKernel { }; template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - float* a, - int a_stride, - float* tau, - int tau_stride, - float* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + float* a, + int a_stride, + float* tau, + int tau_stride, + float* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? 
CUBLAS_OP_T : CUBLAS_OP_N; @@ -232,20 +229,19 @@ void BatchedOrmqr( } template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - double* a, - int a_stride, - double* tau, - int tau_stride, - double* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + double* a, + int a_stride, + double* tau, + int tau_stride, + double* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -305,9 +301,8 @@ void BatchedOrmqr( namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstsq, - ops::LstsqCUDAKernel, - ops::LstsqCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstsq, + ops::LstsqCUDAKernel, + ops::LstsqCUDAKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index b58142d793c..6d1ff9f296e 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -87,7 +87,7 @@ void GetClassInterval(const gpuStream_t& stream, const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -275,7 +275,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const float scale = ctx.Attr("scale"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLComm* comm; @@ -290,7 +290,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream - stream = static_cast( + stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); } @@ -377,8 +377,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 2, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -420,8 +419,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 4, sum(exp(logit - logit_max)) Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -465,7 +463,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; @@ -543,8 +541,7 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const float margin3 = context.Attr("margin3"); const float scale = context.Attr("scale"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const auto sofrmax_dims = 
softmax->dims(); const int axis = sofrmax_dims.size() - 1; diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu index d7e77e92302..f672381ed7a 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cu +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -16,9 +16,7 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss_grad, - ops::MarginRankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index 2c58b99396e..3b52788514b 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -33,7 +33,7 @@ template class MarkerOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto marker_role = ctx.Attr("marker_role"); auto marker_pos = ctx.Attr("marker_pos"); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 2008e6b3fa2..80af6f673c4 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -403,9 +403,9 @@ static inline int GetNumUsedThreads(const int max_threads_per_seq, } template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::LoDTensor* pre_ids, const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, @@ -531,10 +531,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index f6b0349f1ca..87785bfdc85 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -144,15 +144,14 @@ void TestBeamSearch() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestBeamSearch() { +void TestBeamSearch() { paddle::framework::LoDTensor ids; paddle::framework::LoDTensor scores; paddle::framework::LoDTensor pre_ids; paddle::framework::LoDTensor pre_scores; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -185,9 +184,7 @@ void TestBeamSearch - beamsearch; + paddle::operators::math::BeamSearchFunctor beamsearch; beamsearch(*context, &pre_ids, &pre_scores, @@ -235,7 +232,6 @@ TEST(BeamSearch, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(BeamSearch, GPU) { - TestBeamSearch(); + TestBeamSearch(); } #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu 
b/paddle/fluid/operators/math/bert_encoder_functor.cu index b8c23cafe6d..42a54195def 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -532,7 +532,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(half2 *qk_buf_, } template -inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQK(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -549,8 +549,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, CBLAS_TRANSPOSE transB = !k_trans ? CblasNoTrans : CblasTrans; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); blas.BatchedGEMM(transA, @@ -625,7 +624,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, } template -inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQKV(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -641,8 +640,7 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, int k = head_num * size_per_head; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); CBLAS_TRANSPOSE transA = !qk_trans ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !v_trans ? CblasNoTrans : CblasTrans; @@ -663,17 +661,16 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, } template -void MultiHeadGPUComputeFunctor::operator()( - const platform::CUDADeviceContext &dev_ctx, - int batch, - int seq_len, - int head_num, - int head_size, - T *qkptr, - const T *bias_qk_ptr, - T *tptr, - T alpha, - T beta) { +void MultiHeadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, + int batch, + int seq_len, + int head_num, + int head_size, + T *qkptr, + const T *bias_qk_ptr, + T *tptr, + T alpha, + T beta) { auto stream = dev_ctx.stream(); const int tsize = batch * head_num * seq_len * head_size; diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 55d3dd2c3e8..bc59e2fa1a3 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -93,7 +93,7 @@ class EmbEltwiseLayerNormFunctor { template class MultiHeadGPUComputeFunctor { public: - void operator()(const platform::CUDADeviceContext &dev_ctx, + void operator()(const phi::GPUContext &dev_ctx, int batch, int seq_len, int head_num, diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 298c2f4e5ef..11508fd2d1e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -23,9 +23,9 @@ namespace math { * each dimension must be the same, except the axis dimension. */ template -class ConcatFunctor { +class ConcatFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::vector& input, int axis, framework::Tensor* output) { @@ -39,9 +39,9 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class SplitFunctor { +class SplitFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const std::vector& ref_inputs, int axis, @@ -51,9 +51,9 @@ class SplitFunctor { } }; -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 4f0fee91e59..ccbe1c2aeed 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -469,24 +469,18 @@ void TestConcatMain() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestConcatMain() { - auto* context = - new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); +void TestConcatMain() { + auto* context = new phi::GPUContext(paddle::platform::CUDAPlace()); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) .get()); context->PartialInitWithAllocator(); - ConcatCase1( - context); - ConcatCase2( - context); - ConcatCase3( - context); - ConcatCase4( - context); + ConcatCase1(context); + ConcatCase2(context); + ConcatCase3(context); + ConcatCase4(context); delete context; } @@ -495,7 +489,6 @@ void TestConcatMain(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - TestConcatMain(); + TestConcatMain(); #endif } diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu index f04b2d15349..70b3d67caf3 100644 --- a/paddle/fluid/operators/math/context_project.cu +++ b/paddle/fluid/operators/math/context_project.cu @@ -17,8 +17,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 34aeabfac64..cbe76844519 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -50,8 +50,8 @@ __global__ void CosSimDyKernel(const T* x_norm, } template -struct CosSimDyFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct CosSimDyFunctor { + void operator()(const phi::GPUContext& ctx, const T* x_norm, const T* y_norm, const T* x, @@ -69,8 +69,8 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; -template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index a056341c3bf..61682a95c13 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -176,7 +176,7 @@ struct MatrixEighFunctor { // symmetric matrices on GPU, and uses the variable has_vectors // to control whether to return the eigenvectors. 
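Note on the math-functor hunks above (ConcatFunctor, SplitFunctor, CosSimDyFunctor and the like): each keeps its generic primary template and only the GPU specialization's first template argument, plus the matching explicit instantiations, moves to phi::GPUContext. A minimal standalone sketch of that shape, using toy types rather than Paddle's, is:

#include <iostream>

// Toy stand-ins for the framework's device-context classes; GpuCtx plays the
// role that phi::GPUContext takes over in this patch.
struct CpuCtx {};
struct GpuCtx {};

// Primary template: one functor parameterized on the device-context type.
template <typename DeviceContext, typename T>
struct ScaleFunctor;

// CPU specialization.
template <typename T>
struct ScaleFunctor<CpuCtx, T> {
  void operator()(const CpuCtx&, T* data, int n, T factor) const {
    for (int i = 0; i < n; ++i) data[i] *= factor;
  }
};

// GPU specialization: when the context class is unified, only this first
// template argument and the explicit instantiations change; the functor body
// and every call site keep the same shape.
template <typename T>
struct ScaleFunctor<GpuCtx, T> {
  void operator()(const GpuCtx&, T* data, int n, T factor) const {
    for (int i = 0; i < n; ++i) data[i] *= factor;  // stand-in for a kernel launch
  }
};

// Explicit instantiations, mirroring the "template class ConcatFunctor...;"
// lines in the hunks above, which now name phi::GPUContext as the context type.
template struct ScaleFunctor<GpuCtx, float>;
template struct ScaleFunctor<GpuCtx, double>;

int main() {
  GpuCtx gpu;
  float v[3] = {1.f, 2.f, 3.f};
  ScaleFunctor<GpuCtx, float>()(gpu, v, 3, 2.f);
  std::cout << v[0] << ' ' << v[1] << ' ' << v[2] << '\n';  // prints 2 4 6
  return 0;
}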
template -struct MatrixEighFunctor { +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, @@ -187,10 +187,9 @@ struct MatrixEighFunctor { using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto dito = - math::DeviceIndependenceTensorOperations(ctx); + math::DeviceIndependenceTensorOperations(ctx); Tensor input_trans; input_trans = dito.Transpose(input); auto *input_vector = input_trans.data(); @@ -324,34 +323,34 @@ struct MatrixEighFunctor { m(paddle::platform::complex, Che, cuComplex) \ m(paddle::platform::complex, Zhe, cuDoubleComplex) -#define EVDBUFFER_INSTANCE(T, C, CastType) \ - template <> \ - inline void MatrixEighFunctor::EvdBuffer( \ - cusolverDnHandle_t handle, \ - cusolverEigMode_t jobz, \ - cublasFillMode_t uplo, \ - int n, \ - const T *A, \ - int lda, \ - const ValueType *W, \ - int *lwork) const { \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##evd_bufferSize( \ - handle, \ - jobz, \ - uplo, \ - n, \ - reinterpret_cast(A), \ - lda, \ - W, \ - lwork)); \ +#define EVDBUFFER_INSTANCE(T, C, CastType) \ + template <> \ + inline void MatrixEighFunctor::EvdBuffer( \ + cusolverDnHandle_t handle, \ + cusolverEigMode_t jobz, \ + cublasFillMode_t uplo, \ + int n, \ + const T *A, \ + int lda, \ + const ValueType *W, \ + int *lwork) const { \ + PADDLE_ENFORCE_GPU_SUCCESS( \ + platform::dynload::cusolverDn##C##evd_bufferSize( \ + handle, \ + jobz, \ + uplo, \ + n, \ + reinterpret_cast(A), \ + lda, \ + W, \ + lwork)); \ } FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); #define EVD_INSTANCE(T, C, CastType) \ template <> \ - inline void MatrixEighFunctor::Evd( \ + inline void MatrixEighFunctor::Evd( \ cusolverDnHandle_t handle, \ cusolverEigMode_t jobz, \ cublasFillMode_t uplo, \ diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 44ce4f0d6d3..49aae2ebc1d 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -94,7 +94,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -180,8 +180,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -230,7 +230,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -324,10 +324,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct 
GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 93ee9d3a15b..09ec777ebb6 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -179,8 +179,7 @@ void testIm2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testIm2col() { +void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; paddle::framework::Tensor output_cfo; @@ -222,7 +221,7 @@ void testIm2colSetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -240,12 +239,12 @@ void testIm2col im2col; paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> im2col_ocf; @@ -283,12 +282,12 @@ void testIm2col col2im; paddle::operators::math::Col2ImFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -343,8 +342,7 @@ void testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 716989a7869..f18053e297e 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -124,15 +124,14 @@ int UniqSampler(const Sampler& sampler, } template -void GPUSampleWithProb::operator()( - const platform::CUDADeviceContext& context, - const int seed, - const int dict_size, - const bool uniq, - const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P) { +void GPUSampleWithProb::operator()(const phi::GPUContext& context, + const int seed, + const int dict_size, + const bool uniq, + const std::size_t num_samples, + const Tensor* L, + Tensor* S, + Tensor* P) { // UNDERSTAND: dimension issues const auto lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index bb5c2ef9799..1e8fb983a94 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -111,7 +111,7 @@ class SampleWithProb { template class GPUSampleWithProb { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const int seed, const int dict_size, const bool uniq, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f09578a0b1c..7fa9dc27db9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -25,8 +25,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAdd { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const phi::SelectedRows& input2, phi::SelectedRows* output) { @@ -109,8 +109,8 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -210,8 +210,8 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor; 
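Operator-side call sites such as the MatrixEighFunctor and GRUUnitFunctor hunks above (and the kernels earlier in the patch) change only the type requested from ExecutionContext::device_context<>(); helpers such as phi::funcs::GetBlas and phi::funcs::SetConstant get the same single-argument swap to phi::GPUContext. A minimal standalone sketch of that lookup pattern, with toy types standing in for the Paddle classes, is:

#include <iostream>

// Toy stand-ins: GPUContext mimics the accessors the kernels rely on
// (stream(), cudnn_handle()); the real class is phi::GPUContext.
struct GPUContext {
  int stream() const { return 0; }                  // stand-in for a gpuStream_t
  const char* cudnn_handle() const { return "h"; }  // stand-in for cudnnHandle_t
};

// Minimal ExecutionContext that hands out the requested device-context type.
// Paddle's real call sites spell this ctx.template device_context<...>()
// because they sit inside templated kernel code.
class ExecutionContext {
 public:
  template <typename DeviceContext>
  const DeviceContext& device_context() const {
    return gpu_ctx_;  // this toy only supports the GPU context
  }

 private:
  GPUContext gpu_ctx_;
};

// A kernel body in the style of the patch: the only edit is the template
// argument used to fetch the context; stream/handle usage is untouched.
void Compute(const ExecutionContext& ctx) {
  auto& dev_ctx = ctx.device_context<GPUContext>();
  std::cout << "stream=" << dev_ctx.stream()
            << " handle=" << dev_ctx.cudnn_handle() << '\n';
}

int main() {
  ExecutionContext ctx;
  Compute(ctx);
  return 0;
}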
template -struct SelectedRowsAddTo { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const int64_t input2_offset, phi::SelectedRows* input2) { @@ -259,12 +259,11 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -588,14 +587,14 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, } template -struct UpdateToTensor { - void operator()(const platform::CUDADeviceContext& context, +struct UpdateToTensor { + void operator()(const phi::GPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. - MergeAdd merge_func; + MergeAdd merge_func; auto merged_in1 = merge_func(context, input1); auto in1_height = merged_in1.height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 877c3c63aff..746a64ff58c 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -20,10 +20,9 @@ limitations under the License. */ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -62,9 +61,7 @@ TEST(selected_rows_functor, gpu_add) { // simply concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math::SelectedRowsAdd - add_functor; + paddle::operators::math::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -108,9 +105,8 @@ TEST(selected_rows_functor, gpu_add) { new paddle::framework::Tensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTensor - add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); paddle::framework::Tensor tensor2_cpu; @@ -137,10 +133,9 @@ TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -169,9 +164,8 @@ TEST(selected_rows_functor, gpu_add_to) { // simply concat two SelectedRows 
out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTo - add_to_functor; + paddle::operators::math::SelectedRowsAddTo + add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -212,9 +206,8 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); paddle::framework::Tensor tensor1_cpu; @@ -241,10 +234,9 @@ TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_merge_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant set_const; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -269,9 +261,8 @@ TEST(selected_rows_functor, gpu_merge_add) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter:: - MergeAdd - merge_add_functor; + paddle::operators::math::scatter::MergeAdd + merge_add_functor; std::vector inputs; inputs.push_back(selected_rows1.get()); diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index 06eca480ec6..84944270f45 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -116,17 +116,15 @@ TEST(Seq2BatchPadding, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePadding, CUDA) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding( - *context, lod1, 16); + TestSequencePadding(*context, lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding( - *context, lod2, 128); + TestSequencePadding(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 9ee3b107bea..a5edb1db95c 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -189,9 +189,9 @@ __global__ void sequence_pool_kernel(Range_OP op, } template -class SequencePoolFunctor { +class SequencePoolFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, @@ -408,9 +408,9 @@ __global__ void sequence_pool_grad_kernel(Range_OP op, } template -class SequencePoolGradFunctor { +class SequencePoolGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, @@ -493,10 +493,10 @@ class SequencePoolGradFunctor { }; // 
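Note: the GPU tests above (selected_rows_functor_test.cu.cc) fetch the context from the device-context pool; the cast target is stripped in this copy, but the surrounding + lines name phi::GPUContext. A minimal sketch of that fetch:

paddle::platform::CUDAPlace gpu_place(0);
// Pool-owned context; the cast type follows the + lines above.
phi::GPUContext& ctx = *reinterpret_cast<phi::GPUContext*>(
    paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
// Constant-fill helper, now templated on the unified context.
phi::funcs::SetConstant<phi::GPUContext, float> functor;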
sequence pooling -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 63d922b7ebb..9cff64f7560 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -132,17 +132,15 @@ TEST(SequencePoolingGrad, CPU_SUM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePoolingGrad, CUDA_SUM) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum( - *context, lod1, 128); + TestSequencePoolingSum(*context, lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum( - *context, lod2, 128); + TestSequencePoolingSum(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index fd501d5188d..3aceceac32d 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -51,9 +51,9 @@ __global__ void tree2col(const T* eta, } } template -class Tree2ColFunctor { +class Tree2ColFunctor { public: - void operator()(const paddle::platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& node_features, framework::Tensor* patch, @@ -63,7 +63,7 @@ class Tree2ColFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -128,9 +128,9 @@ class Tree2ColFunctor { } }; template -class Col2TreeFunctor { +class Col2TreeFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& patch_grad, framework::Tensor* embedding_grad, @@ -140,7 +140,7 @@ class Col2TreeFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -214,10 +214,10 @@ class Col2TreeFunctor { } }; -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 5a776433199..253f4cb0279 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -111,9 +111,9 @@ __global__ void KernelUnpool3dMaxGrad(const int nthreads, */ template -class Unpool2dMaxFunctor { 
+class Unpool2dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -148,9 +148,9 @@ class Unpool2dMaxFunctor { * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -189,9 +189,9 @@ class Unpool2dMaxGradFunctor { }; template -class Unpool3dMaxFunctor { +class Unpool3dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -230,9 +230,9 @@ class Unpool3dMaxFunctor { * All tensors are in NCDHW format. */ template -class Unpool3dMaxGradFunctor { +class Unpool3dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -274,14 +274,14 @@ class Unpool3dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxFunctor; -template class Unpool3dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index ec3926b95ee..c0c4ed5bb5d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -132,15 +132,14 @@ void testVol2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testVol2col() { +void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; paddle::framework::Tensor output; paddle::framework::Tensor output_tmp; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -202,9 +201,7 @@ void testVol2col - vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -230,9 +227,7 @@ void testVol2col - col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -256,7 +251,6 @@ void testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/fluid/operators/matmul_op.cc 
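Note: im2col_test.cc and vol2col_test.cc above construct the context directly instead of instantiating platform::CUDADeviceContext, then wire the device allocator before use. A sketch of that setup, assuming the hunk shows all the wiring the tests need:

auto* place = new paddle::platform::CUDAPlace();
auto* context = new phi::GPUContext(*place);
// The freshly built context has no allocator; attach the facade's
// stream-bound allocator exactly as the hunk above does.
context->SetAllocator(
    paddle::memory::allocation::AllocatorFacade::Instance()
        .GetAllocator(*place, context->stream())
        .get());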
b/paddle/fluid/operators/matmul_op.cc index c79073861ab..ff7ab502e8e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1055,20 +1055,17 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( matmul, - ops::MatMulKernel, - ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); REGISTER_OP_CUDA_KERNEL( matmul_grad, - ops::MatMulGradKernel, - ops::MatMulGradKernel, - ops::MatMulGradKernel); -REGISTER_OP_CUDA_KERNEL( - matmul_grad_grad, - ops::MatMulDoubleGradKernel, - ops::MatMulDoubleGradKernel); + ops::MatMulGradKernel, + ops::MatMulGradKernel, + ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL(matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif REGISTER_OP_VERSION(matmul).AddCheckpoint( diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 1359bd62b49..08ab074718b 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -92,7 +92,7 @@ template class MeanIoUCUDAOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto& place = *dev_ctx.eigen_device(); // get input and output tensor auto* predictions = ctx.Input("Predictions"); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 8cd84f4b59e..b0513b0af84 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -40,8 +40,7 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto stream = - static_cast(&dev_ctx_)->stream(); + auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; #endif diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc index 90d5fb3eaeb..16b9b5dc6bd 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - merge_selected_rows, - ops::MergeSelectedRowsKernel, - ops::MergeSelectedRowsKernel); +REGISTER_OP_CUDA_KERNEL(merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 1e369c81538..310d28738fc 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -155,5 +155,4 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel); -REGISTER_OP_CUDA_KERNEL( - minus, ops::MinusKernel); +REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 9450b72c95f..67c3a5d90da 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -76,8 +76,7 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - modified_huber_loss, - ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss, + ops::ModifiedHuberLossKernel); REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index b74c1fca088..01ca5d43090 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -58,7 +58,7 @@ class NCCLTester : public ::testing::Test { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - auto *ctx = new p::CUDADeviceContext(place); + auto *ctx = new phi::GPUContext(place); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); @@ -184,7 +184,7 @@ void NCCLTester::testNcclAllReduceOp() { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - auto *dev_ctx = static_cast(dev_ctxs_[i]); + auto *dev_ctx = static_cast(dev_ctxs_[i]); paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[i]), @@ -296,7 +296,7 @@ void NCCLTester::testNcclBcastOp() { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - auto *dev_ctx = static_cast(dev_ctxs_[idx]); + auto *dev_ctx = static_cast(dev_ctxs_[idx]); paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 64f5bc9eab4..330163b1f93 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -92,8 +92,7 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { int64_t batch_size = numbers->numel(); auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::DDim out_dims = phi::make_ddim({upper_range}); auto out_data = number_count->mutable_data(out_dims, place); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 6dfc4a7d13c..85594ff0574 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -99,7 +99,6 @@ class OneHotCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = 
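Note: the registration hunks above (merge_selected_rows, minus, modified_huber_loss, one_hot) are mechanical: each kernel's device-context template argument becomes phi::GPUContext and the macro call is then reflowed. The dtype lists are stripped in this copy, so the ones below are assumptions used only to show the shape:

namespace ops = paddle::operators;
// Single-dtype case.
REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel<phi::GPUContext, float>);
// Multi-dtype case, one entry per line after reflow.
REGISTER_OP_CUDA_KERNEL(merge_selected_rows,
                        ops::MergeSelectedRowsKernel<phi::GPUContext, float>,
                        ops::MergeSelectedRowsKernel<phi::GPUContext, double>);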
paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot, - ops::OneHotCUDAKernel, - ops::OneHotCUDAKernel); +REGISTER_OP_CUDA_KERNEL(one_hot, + ops::OneHotCUDAKernel, + ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index 3314e899a13..205eb2853a3 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -31,7 +31,7 @@ struct CastFunctor { }; template -static void VecCastKernel(const platform::CUDADeviceContext &ctx, +static void VecCastKernel(const phi::GPUContext &ctx, const InT *x, OutT *y, size_t n) { @@ -53,7 +53,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, } // namespace details template -static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, +static void LaunchCastKernel(const phi::GPUContext &ctx, const InT *x, OutT *y, size_t n) { diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu index dc568802a2b..30825a6a329 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu @@ -14,6 +14,5 @@ limitations under the License. */ #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - decayed_adagrad, - ops::DecayedAdagradOpKernel); +REGISTER_OP_CUDA_KERNEL(decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu index e7fdeb617de..7909d58a644 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu @@ -15,6 +15,5 @@ #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_momentum, - ops::DGCMomentumKernel); +REGISTER_OP_CUDA_KERNEL(dgc_momentum, + ops::DGCMomentumKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 7b1397b7df6..e7d795ccc57 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -178,22 +178,21 @@ static size_t FillAlignmentPaddingInfo(std::vector *infos, } template -static T *TensorFillConstant(const platform::CUDADeviceContext &dev_ctx, +static T *TensorFillConstant(const phi::GPUContext &dev_ctx, framework::Tensor *tensor, const framework::DDim &dims, T value) { tensor->Resize(dims); auto *ptr = tensor->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; set_constant(dev_ctx, tensor, value); return ptr; } -static framework::Tensor CastDataForInitedTensor( - const platform::CUDADeviceContext &dev_ctx, - framework::Tensor *origin, - framework::Tensor *fused_out, - size_t numel_offset) { +static framework::Tensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, + framework::Tensor *origin, + framework::Tensor *fused_out, + size_t numel_offset) { PADDLE_ENFORCE_EQ(origin->IsInitialized(), true, platform::errors::InvalidArgument( @@ -338,12 +337,12 @@ static T ClipByBound(T x, T low_value, T high_value) { } template -class DistributedFusedLambInitOpKernel +class DistributedFusedLambInitOpKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext &ctx) const override { VLOG(10) << "starts to run DistributedFusedLambInitOp"; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto place = ctx.GetPlace(); auto stream = dev_ctx.stream(); @@ -790,4 +789,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb_init, - ops::DistributedFusedLambInitOpKernel); + ops::DistributedFusedLambInitOpKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f8d55ff9cf7..394e49dd529 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -242,8 +242,7 @@ static void LogParamAndTrustRatioDivSquareNorm( } } -static bool IsFinite(const platform::CUDADeviceContext &dev_ctx, - const float *ptr) { +static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { auto stream = dev_ctx.stream(); float cpu_value; #ifdef PADDLE_WITH_HIP @@ -509,7 +508,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( template static void MultiTensorUpdateLambMomentAndTrustRatioDiv( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const T *param_p, @@ -779,7 +778,7 @@ template static void MultiTensorUpdateLambParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const MasterT *trust_ratio_div, @@ -898,7 +897,7 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, } template -static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, +static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, const T1 *x, const T2 *scale, T1 *y, @@ -925,7 +924,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { static_assert(std::is_same::value || std::is_same::value, @@ -974,15 +973,14 @@ static void NCCLSumWithScaleBase(const T *sendbuff, } template -static void NCCLReduceScatterWithScale( - const T *sendbuff, - T *recvbuff, - size_t recvcount, - size_t nranks, - ncclComm_t comm, - gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, - const T *scale = nullptr) { +static void NCCLReduceScatterWithScale(const T *sendbuff, + T *recvbuff, + size_t recvcount, + size_t nranks, + ncclComm_t comm, + gpuStream_t stream, + const phi::GPUContext &dev_ctx, + const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); } @@ -994,7 +992,7 @@ static void NCCLAllReduceWithScale(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); @@ -1104,7 +1102,7 @@ static std::string GetMinMaxStr(const T *x, true, platform::errors::InvalidArgument("Only support CUDAPlace currently.")); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); @@ -1276,13 +1274,12 @@ static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, } template -static void LaunchElementwiseAddWithCastKernel( - 
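Note: inside operator Compute bodies the change is a single template argument on device_context(); stream and place handling is untouched. Sketch of the post-patch idiom:

void Compute(const framework::ExecutionContext& ctx) const override {
  // Was: ctx.template device_context<platform::CUDADeviceContext>()
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto place = ctx.GetPlace();
  auto stream = dev_ctx.stream();
  // ... kernel body unchanged ...
}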
const platform::CUDADeviceContext &dev_ctx, - const T1 *x, - const T2 *y, - T3 *z, - int n, - gpuStream_t stream) { +static void LaunchElementwiseAddWithCastKernel(const phi::GPUContext &dev_ctx, + const T1 *x, + const T2 *y, + T3 *z, + int n, + gpuStream_t stream) { int vec_size = std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)), GetChunkedVecSize(z, 0)); @@ -1300,12 +1297,12 @@ static void LaunchElementwiseAddWithCastKernel( } template -class DistributedFusedLambOpKernel +class DistributedFusedLambOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); @@ -2135,4 +2132,4 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb, - ops::DistributedFusedLambOpKernel); + ops::DistributedFusedLambOpKernel); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu index acf8e38ca0f..dbea7e4d51c 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cu +++ b/paddle/fluid/operators/optimizers/ftrl_op.cu @@ -13,5 +13,4 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - ftrl, ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL(ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu index a9f880fdbb6..0d60979eef0 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/fluid/operators/optimizers/lamb_op.cu @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lamb, - ops::LambOpKernel, - ops::LambOpKernel, - ops::LambOpKernel); + ops::LambOpKernel, + ops::LambOpKernel, + ops::LambOpKernel); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index d9aef74931a..5337e56b28d 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -419,25 +419,24 @@ __global__ void MomentumLarsKernel(const T* param, } template -inline void SeparatedLarsMomentumOpCUDAKernel( - const platform::CUDADeviceContext& cuda_ctx, - const T* param_data, - T* param_out_data, - const MT* velocity_data, - MT* velocity_out_data, - const T* grad_data, - const MT* lr, - MT* p_buffer, - MT* g_buffer, - const MT mu, - const MT lars_coeff, - const MT weight_decay, - const MT epsilon, - const MT rescale_grad, - const int64_t numel, - const MT* master_param_data, - MT* master_out_data, - const bool is_amp) { +inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx, + const T* param_data, + T* param_out_data, + const MT* velocity_data, + MT* velocity_out_data, + const T* grad_data, + const MT* lr, + MT* p_buffer, + MT* g_buffer, + const MT mu, + const MT lars_coeff, + const MT weight_decay, + const MT epsilon, + const MT rescale_grad, + const int64_t numel, + const MT* master_param_data, + MT* master_out_data, + const bool is_amp) { LarsThreadConfig lars_thread_config(numel); L2NormKernel<< { void Compute(const framework::ExecutionContext& ctx) const override { int num_blocks_per_sm = 0; bool multi_precision = ctx.Attr("multi_precision"); - auto& cuda_ctx = ctx.template device_context(); + auto& cuda_ctx = ctx.template device_context(); int sm_num = cuda_ctx.GetSMCount(); - framework::Tensor tmp_buffer_t = - ctx.AllocateTmpTensor( - {LARS_BLOCK_SIZE << 1}, cuda_ctx); + framework::Tensor tmp_buffer_t = ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; @@ -684,7 +682,6 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu index be3f6d6c91a..6419e524f71 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu @@ -20,5 +20,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( pow2_decay_with_linear_warmup, - ops::Pow2DecayWithLinearWarmupOpKernel, - ops::Pow2DecayWithLinearWarmupOpKernel); + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index 591dead3b12..c338f4cc717 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. 
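Note: the lars_momentum hunk above also binds temporary-buffer allocation to the new context type; the element type is stripped in this copy, so MT below stands in for the kernel's master-precision type:

auto& cuda_ctx = ctx.template device_context<phi::GPUContext>();
int sm_num = cuda_ctx.GetSMCount();
// p/g norm scratch buffer; MT is an assumption for the stripped dtype.
framework::Tensor tmp_buffer_t =
    ctx.AllocateTmpTensor<MT, phi::GPUContext>({LARS_BLOCK_SIZE << 1}, cuda_ctx);
auto* p_buffer = tmp_buffer_t.mutable_data<MT>(ctx.GetPlace());
auto* g_buffer = p_buffer + LARS_BLOCK_SIZE;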
*/ #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - proximal_adagrad, - ops::ProximalAdagradOpKernel); +REGISTER_OP_CUDA_KERNEL(proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index d556fa74f19..edc911134c7 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - proximal_gd, - ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL(proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 6fd49248db1..28ca7c6d8d3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -65,8 +65,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -class SGDOpKernel - : public framework::OpKernel { +class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu index cbafefb34fd..d8f8e9749b8 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu @@ -19,7 +19,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sparse_momentum, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel); + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel); diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 25dae1ec7f3..5ed217b2e60 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -508,8 +508,8 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), + phi::funcs::SetConstant set_zero; + set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index e523c93f5d1..254e8ebe5c5 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -259,17 +259,14 @@ REGISTER_OP_CPU_KERNEL( ops::PadConstantLikeGradKernel, ops::PadConstantLikeGradKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL(pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); REGISTER_OP_CUDA_KERNEL( pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + 
ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 7e365dbeb1d..f4d8f7083b0 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -101,7 +101,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { int all_length = batch_size * out_batch_len; constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; @@ -171,8 +171,8 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { auto grad_batch_len = partial_len * in_num; auto all_length = grad_batch_len * batch_size; // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index a8d0b145082..69517233bf3 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -94,7 +94,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; @@ -163,8 +163,8 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { auto out_num = outs.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index 04249d37794..ac4666bb174 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -426,7 +426,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(prroi_pool, ops::GPUPRROIPoolOpKernel, ops::GPUPRROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - prroi_pool_grad, - ops::GPUPRROIPoolGradOpKernel, - ops::GPUPRROIPoolGradOpKernel); +REGISTER_OP_CUDA_KERNEL(prroi_pool_grad, + ops::GPUPRROIPoolGradOpKernel, + ops::GPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu 
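Note: partial_concat_op.cu and partial_sum_op.cu show the Eigen variant of the same change: gradient buffers are zero-initialized through the context's eigen_device(). A sketch, with the zero-fill expression reconstructed from the surrounding code:

auto& place =
    *ctx.template device_context<phi::GPUContext>().eigen_device();
for (size_t i = 0; i < outs.size(); ++i) {
  outs[i]->mutable_data<T>(ctx.GetPlace());
  auto dxt = framework::EigenVector<T>::Flatten(*outs[i]);
  dxt.device(place) = dxt.constant(static_cast<T>(0));  // assumed zero-fill
}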
b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 85d57974ede..3b626cd762e 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -130,4 +130,4 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL( prune_gate_by_capacity, - ops::PruneGateByCapacityCUDAKernel); + ops::PruneGateByCapacityCUDAKernel); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc index c8342e6d5d1..6fe0156c01a 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc @@ -19,4 +19,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc index 5c4ae3bdcfe..bba442a630a 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc @@ -19,5 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_push_sparse, - ops::DistributedPushSparseKernel, - ops::DistributedPushSparseKernel); + ops::DistributedPushSparseKernel, + ops::DistributedPushSparseKernel); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 73eb3f15092..d3f1d17e7a3 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -98,12 +98,11 @@ class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker); -REGISTER_OP_CUDA_KERNEL( - send_and_recv, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel); +REGISTER_OP_CUDA_KERNEL(send_and_recv, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel); REGISTER_OP_CPU_KERNEL(send_and_recv, ops::SendAndRecvKernel, ops::SendAndRecvKernel, diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 8d0d2d3090c..9aef7051fa5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -100,8 +100,7 @@ void InitTensorsOnClient(framework::Scope* scope, // ids_var->mutable_data(framework::DDim({rows_numel, 1}), // *place); // for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); auto micro_id_var = scope->Var("microbatch_id")->GetMutable(); @@ -245,7 +244,7 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 9255a5f164b..9c13934ccd4 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -223,21 +223,17 @@ 
REGISTER_OP_CPU_KERNEL( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( py_layer, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel>, - ops::PyLayerOpKernel>); + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel>, + ops::PyLayerOpKernel>); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 24ae989532d..8ae18a56329 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -36,8 +36,7 @@ class QrGPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { bool compute_q; bool reduced_mode; - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const Tensor& x = *context.Input("X"); Tensor& q = *context.Output("Q"); Tensor& r = *context.Output("R"); @@ -69,8 +68,7 @@ class QrGPUKernel : public framework::OpKernel { size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = - math::DeviceIndependenceTensorOperations(context); + math::DeviceIndependenceTensorOperations(context); // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr @@ -94,7 +92,7 @@ class QrGPUKernel : public framework::OpKernel { auto qr_data = qr.mutable_data(context.GetPlace()); auto tau_data = tau.mutable_data(context.GetPlace()); - BatchedGeqrf( + BatchedGeqrf( dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); if (reduced_mode) { @@ -114,16 +112,16 @@ class QrGPUKernel : public framework::OpKernel { // Perform QRGQR for Q using the result from GEQRF // Transpose 'q' to retore the original row-major order if (reduced_mode) { - BatchedOrgqr(dev_ctx, - batch_size, - m, - min_mn, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + min_mn, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -142,29 +140,29 @@ class QrGPUKernel : public framework::OpKernel { qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - new_qr_data, - m, - tau_data, - new_qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + new_qr_data, + m, + tau_data, + new_qr_stride, + tau_stride); auto trans_q = dito.Transpose(new_qr); framework::TensorCopy(trans_q, q.place(), &q); } else { - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -175,16 +173,15 @@ class QrGPUKernel : public framework::OpKernel { }; template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - 
int n, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -227,16 +224,15 @@ void BatchedGeqrf( } template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -279,17 +275,16 @@ void BatchedGeqrf( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -333,17 +328,16 @@ void BatchedOrgqr( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -391,9 +385,8 @@ void BatchedOrgqr( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); -REGISTER_OP_CUDA_KERNEL( - qr_grad, - ops::QrGradKernel, - ops::QrGradKernel); +REGISTER_OP_CUDA_KERNEL(qr_grad, + ops::QrGradKernel, + ops::QrGradKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 93c688aa642..37ca11db3e3 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -24,8 +24,8 @@ namespace paddle { namespace operators { template -struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -61,14 +61,14 @@ struct ChannelDequantizeFunctorV2 { } }; -template struct ChannelDequantizeFunctorV2; -template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_linear, ops::DeQuantizeLinearKernel, ops::DeQuantizeLinearKernel, diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu index 55f7615d0f1..8a10b96a6f0 100644 --- a/paddle/fluid/operators/random_crop_op.cu +++ b/paddle/fluid/operators/random_crop_op.cu @@ -16,7 +16,7 @@ namespace ops = paddle::operators; template -using Kernel = ops::RandomCropKernel; +using Kernel = ops::RandomCropKernel; REGISTER_OP_CUDA_KERNEL(random_crop, Kernel, Kernel, diff --git 
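Note: qr_op.cu above also migrates explicit specializations of free helper functions; the template <> headers and specialization arguments are stripped in this copy. Reconstructed shape of the float case, following the visible parameter list:

// Explicit specialization for float on the unified GPU context.
template <>
void BatchedGeqrf<phi::GPUContext, float>(const phi::GPUContext& dev_ctx,
                                          int batch_size,
                                          int m,
                                          int n,
                                          float* a,
                                          int lda,
                                          float* tau,
                                          int a_stride,
                                          int tau_stride) {
  int lwork = 0;
  auto handle = dev_ctx.cusolver_dn_handle();
  // ... cuSOLVER workspace query and geqrf calls, unchanged by the patch ...
}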
a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index aee430b5057..253560d981d 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -39,7 +39,7 @@ struct Random { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct Random { +struct Random { using Engine = thrust::minstd_rand; template diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 61e38fb00fc..0b8aaf2d970 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -60,8 +60,7 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel { auto out = context.Output("Out"); auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::TensorCopy(*topk_idx, place, out); size_t N = topk_idx->dims()[0]; diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 7571fcae270..83f6f23f985 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -62,7 +62,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); int max_ins = std::max(ins_num, max_size); @@ -83,8 +83,8 @@ class RankAttentionCUDAKernel : public framework::OpKernel { auto ins_rank_eigen = framework::EigenVector::Flatten(*ins_rank); auto out_eigen = framework::EigenVector::Flatten(*Out); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); param_help_eigen.device(place) = param_help_eigen.constant(static_cast(0)); @@ -135,7 +135,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int64_t strideA = block_matrix_row; int64_t strideB = block_matrix_row * para_col; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, 1, @@ -176,9 +176,9 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { auto rank_offset_dims = rank_offset->dims(); auto max_rank = (rank_offset_dims[1] - 1) / 2; int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &dev_ctx = ctx.template device_context(); + auto &place = + *ctx.template device_context().eigen_device(); int max_ins = std::max(ins_num, max_size); // initialize out grad @@ -201,7 +201,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { const T *ins_rank_data = ins_rank->data(); T *param_grad_data = param_grad.data(); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -242,7 +242,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(rank_attention, ops::RankAttentionCUDAKernel, ops::RankAttentionCUDAKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index edf82d00950..b353b2992ce 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -245,10 
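Note: files with many registrations keep a short alias so each macro entry stays on one line; rank_attention_op.cu and quantize_linear_op.cu above retarget that alias, and inside the kernels phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx) replaces the old instantiation. Sketch of the alias form (dtype list assumed):

// The alias now names the unified context.
using GPUCtx = phi::GPUContext;  // was paddle::platform::CUDADeviceContext
REGISTER_OP_CUDA_KERNEL(rank_attention,
                        ops::RankAttentionCUDAKernel<GPUCtx, float>,
                        ops::RankAttentionCUDAKernel<GPUCtx, double>);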
+245,7 @@ REGISTER_OP_CPU_KERNEL(rank_loss_grad, ops::RankLossGradKernel); REGISTER_OP_CUDA_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); + rank_loss, paddle::operators::RankLossKernel); REGISTER_OP_CUDA_KERNEL( rank_loss_grad, - paddle::operators::RankLossGradKernel); + paddle::operators::RankLossGradKernel); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a36d51e42f5..b9c608b62e7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -52,8 +52,8 @@ BufferedReader::BufferedReader( if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = - ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() - .Get(place_))) + ((phi::GPUContext *)(platform::DeviceContextPool::Instance().Get( + place_))) ->stream(); events_.resize(buffer_size); for (auto &event : events_) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 95bb0610771..d7f153700cf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -31,7 +31,7 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, +void TensorReduceImpl(const phi::GPUContext& dev_ctx, const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index e022e128c7f..ea21b985e7f 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -161,8 +161,7 @@ class CUDARenormKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {&pow_value}; auto func = UnsignedPowFunctor(p); - const auto& cuda_ctx = - context.template device_context(); + const auto& cuda_ctx = context.template device_context(); paddle::operators::LaunchSameDimsElementwiseCudaKernel( cuda_ctx, ins, &outs, func); diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu index 3371134f344..07099c30271 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cu +++ b/paddle/fluid/operators/repeat_interleave_op.cu @@ -88,8 +88,7 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); int repeats = context.Attr("Repeats"); framework::LoDTensor index; @@ -218,8 +217,7 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { int64_t numel = in_grad->numel(); int64_t out_nums = output_grad->numel(); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); index_select_grad_init <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -328,23 +326,16 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( repeat_interleave, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel); + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + 
ops::RepeatInterleaveCUDAKernel); REGISTER_OP_CUDA_KERNEL( repeat_interleave_grad, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel); + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b665cce0962..6a25e2c7902 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -428,7 +428,7 @@ class ReshapeKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeKernel(static_cast(dev_ctx), *in, pt_scalar_shape, @@ -461,7 +461,7 @@ class ReshapeGradKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } @@ -491,7 +491,7 @@ class ReshapeDoubleGradKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 633811862d8..f69889f7f8f 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -323,8 +323,7 @@ __global__ void RowConvGradFilter(const T *in, } // namespace template -class RowConvKernel - : public framework::OpKernel { +class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -378,8 +377,7 @@ class RowConvKernel }; template -class RowConvGradKernel - : public framework::OpKernel { +class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -418,7 +416,7 @@ class RowConvGradKernel size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -494,8 +492,6 @@ class RowConvGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - row_conv, ops::RowConvKernel); -REGISTER_OP_CUDA_KERNEL( - row_conv_grad, - ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL(row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL(row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cu.cc b/paddle/fluid/operators/run_program_op.cu.cc index 19cd354c18f..b3383434203 100644 --- a/paddle/fluid/operators/run_program_op.cu.cc +++ b/paddle/fluid/operators/run_program_op.cu.cc @@ -20,9 +20,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; /* see [Why use single type kernel] */ -REGISTER_OP_CUDA_KERNEL( - run_program, - ops::RunProgramOpKernel); -REGISTER_OP_CUDA_KERNEL( - run_program_grad, - ops::RunProgramGradOpKernel); 
+REGISTER_OP_CUDA_KERNEL(run_program, + ops::RunProgramOpKernel); +REGISTER_OP_CUDA_KERNEL(run_program_grad, + ops::RunProgramGradOpKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 1fb9942b37a..d0d8af95a3f 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -145,7 +145,7 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, sampled_logits, static_cast(0)); auto sampled_labels_data = @@ -244,7 +244,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu index 71476fd802b..e96aafa3829 100644 --- a/paddle/fluid/operators/save_combine_op.cu +++ b/paddle/fluid/operators/save_combine_op.cu @@ -16,9 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu index 056894dbae1..03753b6e7e3 100644 --- a/paddle/fluid/operators/save_op.cu +++ b/paddle/fluid/operators/save_op.cu @@ -19,11 +19,10 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 9b1d7a27e58..be406db5056 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -53,6 +53,5 @@ class GPUSeedKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - seed, - paddle::operators::GPUSeedKernel); +REGISTER_OP_CUDA_KERNEL(seed, + paddle::operators::GPUSeedKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index f2117a2f098..2374ec02e8f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -18,21 +18,13 @@ REGISTER_OP_CUDA_KERNEL( sequence_concat, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel); + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel); REGISTER_OP_CUDA_KERNEL( sequence_concat_grad, - paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel, - 
paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel); + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc index 600981b5e96..5939ede964c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_conv, - ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 363c40ce26d..cacd777f17e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -66,9 +66,9 @@ static __global__ void sequence_expand_as_grad_kernel( } template -struct SequenceExpandAsFunctor { +struct SequenceExpandAsFunctor { void operator()( - const platform::CUDADeviceContext &context, + const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*expand referenced lod*/ LoDTensor *out) { @@ -97,8 +97,8 @@ struct SequenceExpandAsFunctor { }; template -struct SequenceExpandAsGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceExpandAsGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const framework::Vector &ref_lod, /*expand based lod*/ LoDTensor *dx) { @@ -133,17 +133,14 @@ struct SequenceExpandAsGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand_as, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_as_grad, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel); + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 5ba02527825..f6e082f4d2a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -96,7 +96,7 @@ void GetOutputOffset(const framework::Vector& x_lod, } template -static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, +static int ExpandByMemoryCopy(const phi::GPUContext& context, const LoDTensor& x, LoDTensor* out, const framework::Vector& x_lod, @@ -142,9 +142,9 @@ static int ExpandByMemoryCopy(const 
platform::CUDADeviceContext& context, } template -struct SequenceExpandFunctor { +struct SequenceExpandFunctor { void operator()( - const platform::CUDADeviceContext& context, + const phi::GPUContext& context, const LoDTensor& x, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -194,8 +194,8 @@ struct SequenceExpandFunctor { }; template -struct SequenceExpandGradFunctor { - void operator()(const platform::CUDADeviceContext& context, +struct SequenceExpandGradFunctor { + void operator()(const phi::GPUContext& context, const LoDTensor& dout, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand based lod*/ @@ -228,16 +228,14 @@ struct SequenceExpandGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_grad, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel); + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu index e963ce610e2..b4284d2717a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu @@ -16,11 +16,7 @@ REGISTER_OP_CUDA_KERNEL( sequence_mask, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel); + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu index 7fc64a530ef..84a3e8da141 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu @@ -15,15 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pad, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pad_grad, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu index 4897474a485..882ec66f501 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pool, - ops::SequencePoolKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pool_grad, - ops::SequencePoolGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu index 38bc599165d..eaf34643a07 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reshape, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); REGISTER_OP_CUDA_KERNEL( sequence_reshape_grad, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel); + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu index 0a59ed7f9fe..810130669b5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu @@ -16,10 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reverse, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu index a4b0ea2e5b2..ecf39a07309 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_slice, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_slice, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 58e99364f4f..b060aa9f08b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -61,10 +61,8 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { phi::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxCUDNNFunctor()( - ctx.template device_context(), - &x_i, - &out_i); + math::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; @@ -97,8 +95,8 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradCUDNNFunctor()( - ctx.template device_context(), + math::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, &x_grad_i); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index bb0ad26b51b..5417c20f3d4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -40,8 +40,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; } #endif @@ -149,8 +148,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? 
true : false; } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 696f6e7ca88..360f9055519 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -121,8 +121,8 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, } template -struct SequenceSoftmaxFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*referenced lod*/ LoDTensor *out) { @@ -146,8 +146,8 @@ struct SequenceSoftmaxFunctor { }; template -struct SequenceSoftmaxGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const LoDTensor &out, const framework::Vector &ref_lod, /*referenced lod*/ @@ -177,12 +177,10 @@ struct SequenceSoftmaxGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL(sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu index bf54f77f5b5..4124e17cb09 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_unpad, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_unpad_grad, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel); + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 7803f407181..6b70b8d37d7 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -88,7 +88,7 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *shuffleidx_data = shuffleidx->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_CUDA const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); #else @@ -106,8 +106,8 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *out_data = out->mutable_data(ctx.GetPlace()); ReorderFunctor functor( x_data, shuffleidx_data, out_data, x_embed_size); - platform::ForRange for_range( - dev_ctx, elem_size * x_embed_size); + platform::ForRange for_range(dev_ctx, + elem_size * x_embed_size); for_range(functor); auto *seed_out_data = seed_out->mutable_data(phi::make_ddim({1}), @@ -136,10 +136,9 @@ class ShuffleBatchGradCUDAKernel : public framework::OpKernel { auto x_embed_size = x_grad->dims()[x_grad->dims().size() - 1]; ReorderFunctor functor( out_grad_data, shuffleidx_data, x_grad_data, x_embed_size); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // TODO(zengjinle): for small data, direct cudaMemcpy may be better - platform::ForRange for_range(dev_ctx, - x_grad->numel()); + platform::ForRange for_range(dev_ctx, x_grad->numel()); for_range(functor); #endif } diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 02c0cfdd969..f51724d8431 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -129,12 +129,9 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( shuffle_channel, - ops::ShuffleChannelOpCUDAKernel, - ops::ShuffleChannelOpCUDAKernel); + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( shuffle_channel_grad, - ops::ShuffleChannelGradOpCUDAKernel, - ops::ShuffleChannelGradOpCUDAKernel); + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4e812261883..f42ebbe0399 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -488,32 +488,24 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CUDA_KERNEL( slice, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel>, - ops::SliceKernel>); + ops::SliceKernel, + 
ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); REGISTER_OP_CUDA_KERNEL( slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel>, - ops::SliceGradKernel>); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index e5df479090f..d57b96d0ec5 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossKernel); -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu index 5a9f7c288d1..f9df5a5f74b 100644 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ b/paddle/fluid/operators/space_to_depth_op.cu @@ -17,16 +17,14 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - space_to_depth, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); -REGISTER_OP_CUDA_KERNEL( - space_to_depth_grad, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 423ec727108..8bf431e59f0 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -209,7 +209,7 @@ input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation */ template -void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxForward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* input, @@ -322,7 +322,7 @@ void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, } template -void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxBackward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* dx, @@ -453,7 +453,7 @@ input: dense A (num_rows,num_cols), dense B (num_rows,num_cols) output: sparse C in CSR format (num_rows,num_rows) */ template -void DotSdd(const platform::CUDADeviceContext& ctx, +void DotSdd(const phi::GPUContext& ctx, const Tensor* a, const Tensor* b, const Tensor* c_offset, @@ -546,7 +546,7 @@ input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) output: dense C 
(num_rows,num_cols) */ template -void DotDsd(const platform::CUDADeviceContext& ctx, +void DotDsd(const phi::GPUContext& ctx, const Tensor* a_offset, const Tensor* a_columns, const Tensor* a_value, @@ -881,10 +881,10 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( sparse_attention, - ops::SparseAttentionCUDAKernel, - ops::SparseAttentionCUDAKernel); + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); REGISTER_OP_CUDA_KERNEL( sparse_attention_grad, - ops::SparseAttentionGradCUDAKernel, - ops::SparseAttentionGradCUDAKernel); + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index f1134726998..661fcc83771 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -13,32 +13,26 @@ #include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c, + ops::FFTC2CKernel, + ops::FFTC2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r, + ops::FFTC2RKernel, + ops::FFTC2RKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c, + ops::FFTR2CKernel, + ops::FFTR2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h index d7911d8ef18..5ced67691ee 100644 --- a/paddle/fluid/operators/spectral_op.cu.h +++ b/paddle/fluid/operators/spectral_op.cu.h @@ -907,8 +907,8 @@ static bool use_optimized_fft_path(const std::vector& axes) { } template -struct FFTC2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -934,7 +934,7 @@ struct FFTC2CFunctor { std::min(static_cast(kMaxFFTNdim), working_axes.size()); first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - exec_fft( + exec_fft( ctx, p_working_tensor, p_out, first_dims, forward); working_axes.resize(working_axes.size() - max_dims); first_dims.clear(); @@ -945,14 +945,14 @@ struct FFTC2CFunctor { std::swap(p_out, p_working_tensor); } - exec_normalization( + exec_normalization( ctx, p_out, out, normalization, out_dims, axes); } }; template -struct FFTC2RFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -965,28 +965,27 @@ struct FFTC2RFunctor { framework::Tensor x_copy(X->type()); x_copy.mutable_data(X->dims(), ctx.GetPlace()); framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft( - ctx, &x_copy, out, axes, forward); + exec_fft(ctx, &x_copy, out, axes, forward); } 
else { framework::Tensor temp_tensor; temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); const std::vector dims(axes.begin(), axes.end() - 1); - FFTC2CFunctor c2c_functor; + FFTC2CFunctor c2c_functor; c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - exec_fft( + exec_fft( ctx, &temp_tensor, out, {axes.back()}, forward); } - exec_normalization( + exec_normalization( ctx, out, out, normalization, out_dims, axes); } }; // n dimension real to complex FFT use cufft lib template -struct FFTR2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -996,22 +995,21 @@ struct FFTR2CFunctor { framework::Tensor* r2c_out = out; const std::vector last_dim{axes.back()}; std::vector out_dims = phi::vectorize(out->dims()); - exec_fft( - ctx, X, r2c_out, last_dim, forward); + exec_fft(ctx, X, r2c_out, last_dim, forward); // Step2: C2C transform on the remaining dimension framework::Tensor c2c_out; if (axes.size() > 1) { c2c_out.mutable_data(out->dims(), ctx.GetPlace()); std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; + FFTC2CFunctor fft_c2c_func; fft_c2c_func( ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); } const auto in_sizes = phi::vectorize(X->dims()); framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out; - exec_normalization( + exec_normalization( ctx, norm_tensor, out, normalization, in_sizes, axes); } }; diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc index f18efe4a035..24f4d65f661 100644 --- a/paddle/fluid/operators/spp_op.cu.cc +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - spp, - ops::SppKernel, - ops::SppKernel); -REGISTER_OP_CUDA_KERNEL( - spp_grad, - ops::SppGradKernel, - ops::SppGradKernel); +REGISTER_OP_CUDA_KERNEL(spp, + ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL(spp_grad, + ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu index 9cef47bd07e..c10cbfb42f1 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -14,10 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - squared_l2_distance, - ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance, + ops::SquaredL2DistanceKernel); REGISTER_OP_CUDA_KERNEL( squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc index c7a96d03173..a77b369c403 100644 --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ b/paddle/fluid/operators/squeeze_op.cu.cc @@ -19,31 +19,27 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>); + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>); REGISTER_OP_CUDA_KERNEL( squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>); diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 7bc3396064c..9edee0f66c5 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -17,12 +17,10 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - stft, - ops::StftKernel, - ops::StftKernel); +REGISTER_OP_CUDA_KERNEL(stft, + ops::StftKernel, + ops::StftKernel); -REGISTER_OP_CUDA_KERNEL( - stft_grad, - ops::StftGradKernel, - ops::StftGradKernel); +REGISTER_OP_CUDA_KERNEL(stft_grad, + ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index d65fc9ea808..350c3820a38 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -110,8 +110,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(ctx); + auto& cuda_ctx = reinterpret_cast(ctx); memory::Copy(gpu_place, dst + i * dst_after, gpu_place, diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index e16df345427..3d8902a68ac 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -86,7 +86,7 @@ TEST(StridedMemcpy, GPUCrop) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -128,7 +128,7 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); 
platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 7b307413cd3..2cc17de1820 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -77,8 +77,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { const size_t in_num = in_vars.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); @@ -138,11 +137,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { int start = in_place ? 1 : 0; if (!in_place) { - phi::funcs::SetConstant constant_functor; - constant_functor( - context.template device_context(), - out, - static_cast(0)); + phi::funcs::SetConstant constant_functor; + constant_functor(context.template device_context(), + out, + static_cast(0)); } std::vector in_data; @@ -243,8 +241,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } template -class SumKernel - : public framework::OpKernel { +class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto out_var = context.OutputVar("Out"); @@ -252,9 +249,9 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); } else if (out_var->IsType()) { - SelectedRowsCompute(context); + SelectedRowsCompute(context); } else if (out_var->IsType()) { - LodTensorArrayCompute(context); + LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of Output(out) must be Tensor, SelectedRows or " @@ -269,11 +266,10 @@ class SumKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL(sum, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h index f531df936cd..ef8a041fc5a 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/fluid/operators/tensor_to_string.h @@ -38,7 +38,7 @@ static std::vector ToVector(const T *x, using CopyT = typename std:: conditional::value, uint8_t, T>::type; std::vector cpu_x(n); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); memory::Copy(platform::CPUPlace(), cpu_x.data(), diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 1cd2683796a..b13996b6fab 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -471,8 +471,7 @@ class TensorRTEngineOp : public framework::OperatorBase { int runtime_batch = -1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); + auto stream = reinterpret_cast(dev_ctx).stream(); std::vector output_maps = Attr>("output_name_mapping"); diff --git 
a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 8e2b162babc..33ebaff8eab 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -142,7 +142,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -171,7 +171,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index 6df883e8333..1162bf21592 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -96,10 +96,9 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, #if defined(__NVCC__) || defined(__HIPCC__) if (platform::is_gpu_place(place)) { - auto &cuda_dev_ctx = dynamic_cast(dev_ctx); + auto &cuda_dev_ctx = dynamic_cast(dev_ctx); functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx); - platform::ForRange for_range(cuda_dev_ctx, - limit); + platform::ForRange for_range(cuda_dev_ctx, limit); for_range(actual_functor); } else { #endif diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 390ed2b2ff3..4a038c93a1f 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -902,7 +902,7 @@ __global__ void AssignGradWithAxis(const T* grad_out, } // use the radix sort for the topk template -bool SortTopk(const platform::CUDADeviceContext& ctx, +bool SortTopk(const phi::GPUContext& ctx, const framework::Tensor* input_tensor, const int64_t num_cols, const int64_t num_rows, diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 4910d1cf259..79236f590f7 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -157,26 +157,18 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( top_k, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( top_k_grad, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - 
paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel); diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu index 17d52cea1e0..1e4ca7bb838 100644 --- a/paddle/fluid/operators/tree_conv_op.cu +++ b/paddle/fluid/operators/tree_conv_op.cu @@ -15,11 +15,9 @@ #include "paddle/fluid/operators/tree_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - tree_conv, - ops::TreeConvKernel, - ops::TreeConvKernel); -REGISTER_OP_CUDA_KERNEL( - tree_conv_grad, - ops::TreeConvGradKernel, - ops::TreeConvGradKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv, + ops::TreeConvKernel, + ops::TreeConvKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv_grad, + ops::TreeConvGradKernel, + ops::TreeConvGradKernel); diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index febb093ed70..a3490937410 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -32,12 +32,11 @@ class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); auto dims = vectorize(dx->dims()); - const auto& dev_cxt = - ctx.template device_context(); + const auto& dev_cxt = ctx.template device_context(); float value = static_cast(0.0f); phi::FullKernel( static_cast::TYPE&>(dev_cxt), + phi::GPUContext>::TYPE&>(dev_cxt), dims, value, phi::DataType::UNDEFINED, diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 0614e0920df..9f0f93f5573 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -150,8 +150,7 @@ template void UniformRandom(const framework::ExecutionContext& context, framework::Tensor* tensor) { int64_t size = tensor->numel(); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); T* data = tensor->mutable_data(dev_cxt.GetPlace()); if (size <= 0) return; unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc index 71c05658033..82890419daf 100644 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -15,19 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CUDA_KERNEL( - unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d, + ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc index 598595ff28b..3a98a64d858 100644 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -19,35 +19,30 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>); diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index cd6c3a22e03..3ec89214a38 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -67,7 +67,7 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); - CUDADeviceContext gpu_ctx(gpu_place); + phi::GPUContext gpu_ctx(gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 00b5dd7f8af..2589aa9acd0 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -41,10 +41,10 @@ class NCCLCommImpl : public NCCLComm { gpuStream_t stream() const override { return dev_ctx_->stream(); } - void set_dev_ctx(std::unique_ptr&& dev_ctx) { + void set_dev_ctx(std::unique_ptr&& dev_ctx) { dev_ctx_ = std::move(dev_ctx); } - CUDADeviceContext* dev_context() const override { return dev_ctx_.get(); } + phi::GPUContext* dev_context() const override { return 
dev_ctx_.get(); } gpuEvent_t compute_event() const override { return compute_event_.get(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int nranks_; int rank_; ncclComm_t comm_; - std::unique_ptr dev_ctx_; + std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream std::shared_ptr compute_event_; @@ -203,8 +203,8 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( NCCLComm* NCCLCommContext::AssignNCCLComm( ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { - std::unique_ptr dev_ctx( - new CUDADeviceContext(CUDAPlace(dev_id))); + std::unique_ptr dev_ctx( + new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) .get()); @@ -246,7 +246,7 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( comm_map_mutex_.unlock(); if (ring_id == 0) { - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(dev_id))); dev_ctx->set_nccl_comm(comm); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 37065960828..207496d9f46 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -62,7 +62,7 @@ class NCCLComm { virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; - virtual CUDADeviceContext* dev_context() const = 0; + virtual phi::GPUContext* dev_context() const = 0; virtual ~NCCLComm() = default; }; diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index bfdf492962d..9f049b6e248 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -27,8 +27,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode, int64_t pool_id) { auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); // After PR(#43206), cudnn related initializations will change to lazy mode. 
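The hunks above converge on a single lookup idiom: code that used to ask for platform::CUDADeviceContext now asks for phi::GPUContext, either through the execution context inside a kernel or by casting the pooled DeviceContext*. A minimal sketch of both forms follows, assuming a CUDA build; the free-function names are illustrative and not part of the patch.

// Sketch only: the two retrieval forms this patch standardizes on.
// `GetPooledGpuContext` and `IllustrativeCompute` are illustrative names.
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"

phi::GPUContext* GetPooledGpuContext(const paddle::platform::CUDAPlace& place) {
  // The pool hands back a DeviceContext*; on a GPU place it is backed by
  // phi::GPUContext, so the cast mirrors what the diff does.
  return static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
}

template <typename T>
void IllustrativeCompute(const paddle::framework::ExecutionContext& ctx) {
  // Inside an OpKernel, request phi::GPUContext directly and take its stream.
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto stream = dev_ctx.stream();
  (void)stream;
}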
@@ -66,8 +65,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, std::unique_ptr EndCUDAGraphCapture() { auto place = CUDAGraph::CapturingPlace(); auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); dev_ctx->SetCUDAGraphAllocator(nullptr); return CUDAGraph::EndCapture(); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index cc76a04a769..427901c1a7f 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -621,7 +621,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 507baf6c0f4..3628b7e0418 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -65,8 +65,8 @@ static inline int RoundToPowerOfTwo(int n) { #ifdef WITH_NV_JETSON // The number of threads cannot be assigned 1024 in some cases when the device // is nano or tx2 . -template -inline void ChangeThreadNum(const CUDADeviceContext& context, +template +inline void ChangeThreadNum(const phi::GPUContext& context, int* num_thread, int alternative_num_thread = 512) { if (context.GetComputeCapability() == 53 || @@ -99,10 +99,9 @@ struct GpuLaunchConfig { * cuda performs better. And number of blocks should be greater (at least * 2x~4x) than number of SMs. Hence, SM count is took into account within * this function to determine the right number of threads per block. */ -inline GpuLaunchConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, - int64_t numel, - int vec_size = 1) { +inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, + int64_t numel, + int vec_size = 1) { PADDLE_ENFORCE_GE(numel, 0, platform::errors::InvalidArgument( @@ -146,8 +145,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( return config; } -inline GpuLaunchConfig GetGpuLaunchConfig2D( - const platform::CUDADeviceContext& context, int x_dim, int y_dim) { +inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, + int x_dim, + int y_dim) { PADDLE_ENFORCE_GT( x_dim, 0, @@ -182,8 +182,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( template void LimitGridDim(const Context& ctx, dim3* grid_dim) { - auto max_grid_dim = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize(); + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; grid_dim->z = grid_dim->z < max_grid_dim[2] ? 
grid_dim->z : max_grid_dim[2]; diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 1ce8038f0e3..a5d89f6001f 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -120,11 +120,11 @@ class NCCLGroupGuard { }; struct NCCLContext { - std::unique_ptr ctx_; + std::unique_ptr ctx_; ncclComm_t comm_; explicit NCCLContext(int dev_id) : comm_{nullptr} { - ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id))); + ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id))); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(CUDAPlace(dev_id), ctx_->stream()) .get()); @@ -211,11 +211,9 @@ struct NCCLContextMap { NCCLContextMap(const NCCLContextMap &other) = delete; NCCLContextMap &operator=(const NCCLContextMap &other) = delete; - CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + phi::GPUContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - CUDADeviceContext *DevCtx(platform::Place p) const { - return DevCtx(p.device); - } + phi::GPUContext *DevCtx(platform::Place p) const { return DevCtx(p.device); } const NCCLContext &at(platform::Place p) const { return this->at(p.device); } diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index ff7f64ef1be..9cb5cdfbb16 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -558,7 +558,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_HIP if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index bd8d3f8a372..f91b420be0d 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -252,7 +252,7 @@ bool CUDADeviceCode::Compile(bool include_path) { } // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; @@ -329,7 +329,7 @@ bool CUDADeviceCode::Compile(bool include_path) { } // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); std::string compute_flag = @@ -416,7 +416,7 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { max_blocks, (static_cast(n) + workload_per_block - 1) / workload_per_block); - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 113862c6ec2..d38118d2a26 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -50,17 +50,16 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* 
default_dev_ctx = static_cast( + auto* default_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = - static_cast(dev_ctx); + auto& desired_dev_ctx = static_cast(dev_ctx); if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size, phi::Stream(reinterpret_cast( desired_dev_ctx.stream()))); } else { - return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( + return allocation::GPUContextAllocatorPool::Instance().Alloc( desired_dev_ctx, size); } #else @@ -191,11 +190,11 @@ std::unique_ptr CreateDeviceContext( auto* dev_ctx = new DevCtx(p); if (is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* cuda_ctx = dynamic_cast(dev_ctx); + auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, platform::errors::InvalidArgument( - "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + "Failed to dynamic_cast dev_ctx into phi::GPUContext.")); auto& instance = memory::allocation::AllocatorFacade::Instance(); if (!disable_setting_default_stream_for_allocator) { @@ -271,7 +270,7 @@ void EmplaceDeviceContexts( #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext( + EmplaceDeviceContext( place_to_device_context, p, disable_setting_default_stream_for_allocator); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 70b979aa9bb..6d08a0cc32b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -271,11 +271,9 @@ struct DefaultDeviceContextType { class CudnnWorkspaceHandle; class EigenCudaStreamDevice; -using CUDADeviceContext = phi::GPUContext; - class CudnnWorkspaceHandle { public: - inline CudnnWorkspaceHandle(const CUDADeviceContext& dev_ctx, std::mutex* mtx) + inline CudnnWorkspaceHandle(const phi::GPUContext& dev_ctx, std::mutex* mtx) : device_context_(dev_ctx), mtx_(mtx) {} template @@ -318,13 +316,13 @@ class CudnnWorkspaceHandle { private: memory::allocation::AllocationPtr allocation_; - const CUDADeviceContext& device_context_; + const phi::GPUContext& device_context_; std::mutex* mtx_; }; template <> struct DefaultDeviceContextType { - using TYPE = CUDADeviceContext; + using TYPE = phi::GPUContext; }; // Currently, CUDAPinnedDeviceContext is only used to data copying. diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 2db29dc11ad..abffa1e8846 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -19,13 +19,13 @@ limitations under the License. 
 */
 #include "paddle/fluid/platform/device_context.h"
 TEST(Device, Init) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
+  using phi::GPUContext;
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -50,13 +50,13 @@ TEST(Device, Init) {
   }
 }
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
+TEST(Device, GPUContext) {
   using paddle::platform::CUDAPlace;
+  using phi::GPUContext;
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -94,10 +94,10 @@ TEST(Device, CUDADeviceContext) {
 TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
   DeviceContextPool& pool = DeviceContextPool::Instance();
   auto cpu_dev_ctx1 = pool.Get(CPUPlace());
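Editor's note: the construction pattern exercised by these tests (build a phi::GPUContext for a place, then wire it to an allocator bound to its own stream before first use) recurs throughout this patch. A minimal sketch of that pattern, using only the calls visible in the hunks above; the helper name MakeTestGpuContext and the header paths are illustrative assumptions, not part of the patch:

// Sketch: create a phi::GPUContext for device `dev_id` the way the tests do.
// The tests shown here only set the device allocator; nothing else.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

phi::GPUContext* MakeTestGpuContext(int dev_id) {
  paddle::platform::CUDAPlace place(dev_id);
  auto* ctx = new phi::GPUContext(place);
  // A context must be given an allocator tied to its own stream before use.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  return ctx;
}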
diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu
index efb0d9ed756..14967edbe4e 100644
--- a/paddle/fluid/platform/device_context_test_cuda_graph.cu
+++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu
@@ -20,11 +20,11 @@ limitations under the License.
 */
 #include "paddle/fluid/platform/device_context.h"
 TEST(Device, DeviceContextWithCUDAGraph) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
   DeviceContextPool& pool = DeviceContextPool::Instance();
   Place place = CUDAPlace(0);
diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc
index c9d9b6915b1..37da8daf7fd 100644
--- a/paddle/fluid/platform/device_event_gpu.cc
+++ b/paddle/fluid/platform/device_event_gpu.cc
@@ -49,12 +49,11 @@ void DeviceEventCreateCUDA(DeviceEvent* event,
 void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
   wrapper->inner_event_.Record(cuda_dev_ctx->stream());
 }
@@ -78,12 +77,11 @@ void DeviceEventFinishCUDA(const DeviceEvent* event) {
 void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
                              const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
   // calling cudaStreamWaitEvent(stream, event, 0)
   cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
 }
diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc
index 9fb423e782d..7dfacc66437 100644
--- a/paddle/fluid/platform/device_event_test.cc
+++ b/paddle/fluid/platform/device_event_test.cc
@@ -33,8 +33,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
   // case 1. test for event_creator
@@ -83,8 +82,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
   // case 1.
test for event_creator diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index d91cb6da2dc..d6edb9ba947 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -328,7 +328,7 @@ TEST(float16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); - CUDADeviceContext gpu_ctx(gpu_place); + phi::GPUContext gpu_ctx(gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 5e0717ba635..ce68452ffbe 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -40,9 +40,9 @@ using paddle::memory::Alloc; using paddle::memory::Copy; using paddle::platform::CPUPlace; -using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; using phi::CPUContext; +using phi::GPUContext; using paddle::platform::Transform; @@ -58,7 +58,7 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); - CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -67,7 +67,7 @@ TEST(Transform, GPUUnary) { auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); float* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); @@ -89,7 +89,7 @@ TEST(Transform, CPUBinary) { TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); - CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -97,7 +97,7 @@ TEST(Transform, GPUBinary) { auto gpu_allocation = Alloc(gpu0, sizeof(buf)); int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index df5b2c27122..f93e9b6de92 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1257,7 +1257,7 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - auto* context = new paddle::platform::CUDADeviceContext(place); + auto* context = new phi::GPUContext(place); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, context->stream()) diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 5d06dddd964..9b5c24abc67 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -243,9 +243,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context, grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } - auto stream = - reinterpret_cast(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmForward(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmBackward -struct GRUUnitFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -93,8 +93,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -184,8 +183,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -236,8 +235,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -333,10 +331,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cu b/paddle/phi/kernels/funcs/lstm_compute.cu index b2057cfc4f9..e3e8b6cc124 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cu +++ b/paddle/phi/kernels/funcs/lstm_compute.cu @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" namespace phi { namespace funcs { template -struct LstmUnitFunctor { - static void compute(const paddle::platform::CUDADeviceContext& context, +struct LstmUnitFunctor { + static void compute(const phi::GPUContext& context, LstmMetaValue value, int frame_size, int batch_size, @@ -43,8 +43,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const paddle::platform::CUDADeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const phi::GPUContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, @@ -67,10 +67,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index bbd160e35c7..9f0c20ccf14 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -222,11 +222,10 @@ struct TensorSetConstantGPU { template void apply() const { - SetConstant functor; - functor( - reinterpret_cast(context_), - tensor_, - static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, + static_cast(value_)); } const paddle::platform::DeviceContext& context_; @@ -255,8 +254,8 @@ __global__ void RowwiseAddKernel( } template -struct RowwiseAdd { - void operator()(const paddle::platform::CUDADeviceContext& context, +struct RowwiseAdd { + void operator()(const phi::GPUContext& context, const paddle::framework::Tensor& input, const paddle::framework::Tensor& vector, paddle::framework::Tensor* output) { @@ -294,18 +293,18 @@ struct RowwiseAdd { } }; -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug // mode, // and only failed for this case. So reimplemented it. 
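Editor's note: the comment above is the key to the specialization that follows. Summing the columns of an m x n matrix A is the matrix-vector product A^T * ones(m), so the reduction can be delegated to a single GEMV instead of the Eigen-based ColwiseSum that failed in debug mode; RowwiseSum further down uses the same idea. A standalone cuBLAS sketch of the identity (column-major storage; the function and buffer names are illustrative, not Paddle's Blas wrapper):

// colsum[j] = sum_i A[i][j] == (A^T * ones)[j], computed with one GEMV.
#include <cublas_v2.h>

void ColumnSums(cublasHandle_t handle,
                const float* d_A,      // m x n, column-major, lda = m
                const float* d_ones,   // device vector holding m ones
                float* d_colsum,       // output, length n
                int m, int n) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  // y = alpha * A^T * x + beta * y
  cublasSgemv(handle, CUBLAS_OP_T, m, n, &alpha, d_A, m, d_ones, 1, &beta,
              d_colsum, 1);
}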
template <> -void ColwiseSum::operator()( - const paddle::platform::CUDADeviceContext& context, +void ColwiseSum::operator()( + const phi::GPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* vector) { auto in_dims = input.dims(); @@ -320,28 +319,28 @@ void ColwiseSum::operator()( vector->numel())); paddle::framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - phi::funcs::GetBlas(context) - .GEMV(true, - static_cast(in_dims[0]), - static_cast(in_dims[1]), - 1.0, - input.data(), - one.data(), - 0.0, - vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[0]), + static_cast(in_dims[1]), + 1.0, + input.data(), + one.data(), + 0.0, + vector->data()); } -template struct RowwiseSum; -// template struct RowwiseSum; +template struct RowwiseSum; +// template struct RowwiseSum; // TODO(zcd): Following ColwiseSum format, need to confirm. -// The RowwiseSum failed in debug +// The RowwiseSum failed in debug // mode, // and only failed for this case. So reimplemented it. template <> -void RowwiseSum::operator()( - const paddle::platform::CUDADeviceContext& context, +void RowwiseSum::operator()( + const phi::GPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* vector) { auto in_dims = input.dims(); @@ -356,25 +355,25 @@ void RowwiseSum::operator()( vector->numel())); paddle::framework::Tensor one; one.mutable_data({size}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - phi::funcs::GetBlas(context) - .GEMV(true, - static_cast(in_dims[1]), - static_cast(in_dims[0]), - 1.0, - one.data(), - input.data(), - 0.0, - vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[1]), + static_cast(in_dims[0]), + 1.0, + one.data(), + input.data(), + 0.0, + vector->data()); } -template struct RowwiseMean; -template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; template -struct ElementwiseAddTo { - void operator()(paddle::platform::CUDADeviceContext* ctx, +struct ElementwiseAddTo { + void operator()(phi::GPUContext* ctx, const paddle::framework::Tensor& src, paddle::framework::Tensor* dst) { auto in = paddle::framework::EigenVector::Flatten(src); @@ -384,10 +383,8 @@ struct ElementwiseAddTo { } }; -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu index a66030e6426..196ca7a2ef9 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cu +++ b/paddle/phi/kernels/funcs/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const paddle::platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const paddle::framework::Tensor& src, paddle::framework::Vector index_lod, paddle::framework::Tensor* dst, @@ -90,19 +90,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template 
class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index e0b7bba50d6..657430e1e75 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -79,8 +79,7 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - auto* ctx = reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, input, input_width, @@ -131,9 +130,7 @@ void TopkKernel(const Context& dev_ctx, dev_ctx.template Alloc(&sorted_output); dev_ctx.template Alloc(&sorted_indices); dev_ctx.template Alloc(&gather_indices); - auto* ctx = - reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, out, k, @@ -239,8 +236,7 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - auto* ctx = reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, &trans_input, input_width, diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu index 853187fc802..479d874626a 100644 --- a/paddle/phi/tests/kernels/test_math_function.cu +++ b/paddle/phi/tests/kernels/test_math_function.cu @@ -37,9 +37,9 @@ void fill_fp16_data(phi::dtype::float16* in_ptr, } template -inline phi::funcs::BlasT GetBlas( - const paddle::platform::CUDADeviceContext& context) { - return phi::funcs::GetBlas(context); +inline phi::funcs::BlasT GetBlas( + const phi::GPUContext& context) { + return phi::funcs::GetBlas(context); } TEST(math_function, notrans_mul_trans_fp32) { @@ -51,7 +51,7 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -87,7 +87,7 @@ TEST(math_function, notrans_mul_trans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -134,7 +134,7 @@ TEST(math_function, trans_mul_notrans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -176,7 +176,7 @@ TEST(math_function, trans_mul_notrans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -229,7 +229,7 @@ TEST(math_function, 
gemm_notrans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -287,7 +287,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -364,7 +364,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -416,7 +416,7 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -485,7 +485,7 @@ void GemvTest(int m, int n, bool trans) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h index ffe89fde047..70919708e19 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h @@ -62,7 +62,7 @@ struct ReluFunctor { #if defined(__NVCC__) || defined(__HIPCC__) if (paddle::platform::is_gpu_place(place)) { - LAUNCH_RELU_KERNEL(paddle::platform::CUDADeviceContext); + LAUNCH_RELU_KERNEL(phi::GPUContext); return; } #endif -- GitLab
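Editor's note: the LAUNCH_RELU_KERNEL(phi::GPUContext) call in the custom-op test above expands to an element-wise ReLU launched on the context's stream. A self-contained CUDA sketch of such a kernel (not the macro's actual expansion, which lives in custom_raw_op_kernel_op.h):

// Minimal element-wise ReLU, launched on an explicit stream.
#include <cuda_runtime.h>

__global__ void ReluKernel(const float* x, float* y, size_t n) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < n) y[i] = x[i] > 0.0f ? x[i] : 0.0f;
}

void LaunchRelu(const float* x, float* y, size_t n, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  ReluKernel<<<blocks, threads, 0, stream>>>(x, y, n);
}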