diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
index 07d2a0f6b727aa56ef804e5ca9dee8e7a86e2cdb..643ef52e87bdaff0d531a68922077a8877830a9f 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(fill_constant);
 
 namespace paddle {
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index 176a02d896384f90226eb196436a9a41670852a7..8aa6b7b8460749911a9f7187564aa1195006b537 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -176,6 +176,6 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
 }
 
 USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index d2bc05f41b532238c688960087dba6ce1281331f..53d97b2919a5bf6b1a7b0c99b3ed46b5f70b27ef 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -189,6 +189,6 @@ USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
index c2f0479460064e05fc917ec432a7384e43e73cf3..0b2585905d3eda09b2565812f918949ed7f2ffba 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
@@ -212,6 +212,6 @@ TEST(Benchmark, FluidMLPCPU) {
 } // namespace paddle
 
 USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index 250005e31150c3c9d83d3d094ccb4e00b2de7429..9cebb73a34a7ff6541a499bdd4f36997034f4bf1 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -249,6 +249,6 @@ USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc
index 5b95b43edea82b8beac9c46fe81651784f608274..e3bdba05e97365fb177e6130d5ceaab9f7838529 100644
--- a/paddle/fluid/eager/tests/task_tests/generated_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -123,5 +123,5 @@ TEST(Generated, ElementwiseAdd) {
 } // namespace egr
 
 USE_OP(sigmoid);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index 619976d45fb0d9675e09046f2fad8fc3bbf5d90a..b56c9cb13ccdc2dd1c7a1dfcd1aad6da27590cae 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/pten/core/kernel_factory.h"
 
 namespace paddle {
 namespace framework {
@@ -271,25 +272,41 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU(
   if (op_type == "c_sync_calc_stream" || op_type == "c_sync_comm_stream") {
     return true;
   }
-  auto &all_kernels = OperatorWithKernel::AllOpKernels();
-  auto it = all_kernels.find(op_type);
-  // skip op not has kernel
-  if (it != all_kernels.end()) {
-    bool support_cpu = false;
-    bool support_gpu = false;
-    for (auto &kernel_pair : it->second) {
-      if (platform::is_cpu_place(kernel_pair.first.place_)) {
-        support_cpu = true;
-      }
-      if (platform::is_gpu_place(kernel_pair.first.place_)) {
-        support_gpu = true;
+  bool support_cpu = false;
+  bool support_gpu = false;
+  auto &kernel_factory = pten::KernelFactory::Instance();
+  auto kernel_key_map =
+      kernel_factory.SelectKernelMap(pten::TransToPtenKernelName(op_type));
+  bool has_op_kernel = kernel_key_map.size() > 0 ? true : false;
+  for (auto &kernel : kernel_key_map) {
+    if (platform::is_gpu_place(
+            pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_gpu = true;
+    } else if (platform::is_cpu_place(
+                   pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_cpu = true;
+    }
+  }
+
+  if (!support_cpu || !support_gpu) {
+    auto &all_kernels = OperatorWithKernel::AllOpKernels();
+    auto it = all_kernels.find(op_type);
+    // skip op not has kernel
+    if (it != all_kernels.end()) {
+      has_op_kernel = true;
+      for (auto &kernel_pair : it->second) {
+        if (platform::is_cpu_place(kernel_pair.first.place_)) {
+          support_cpu = true;
+        } else if (platform::is_gpu_place(kernel_pair.first.place_)) {
+          support_gpu = true;
+        }
       }
     }
-    VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu << ", support GPU: " << support_gpu;
-    return support_cpu && support_gpu;
   }
-  return true;
+
+  VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu << ", support GPU: " << support_gpu;
+  return has_op_kernel ? (support_cpu && support_gpu) : true;
 }
 
 bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck(
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
index abed6a5bd4bc48e01d9bcf20abf1bed236ed847a..ed9f6230720f83100e641068c8664d643b6db260 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
@@ -26,7 +26,7 @@
 USE_OP(mul);
 USE_OP(cinn_launch);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 namespace paddle::framework {
 
 using Name2VarInfoMap =
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index 746d90cef917cdb8c4740adf7dff3438c2ca1249..d33dc7f49feb0f4c9e585d13186d65b6c2d618c0 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -23,8 +23,8 @@
 USE_OP_ITSELF(scale);
 USE_OP(elementwise_mul);
-USE_OP(elementwise_add);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(elementwise_add_grad);
 
 DECLARE_double(eager_delete_tensor_gb);
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 5f819ddbfaf8b88732b35119014c34644a1c402b..96aa95bde337436dd6eb584b3eea5395b5301a34 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -29,7 +29,7 @@ USE_OP(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
 USE_OP(conv2d_transpose);
 USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(gelu);
 USE_OP_DEVICE_KERNEL(gelu, MKLDNN);
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
index 90dc7801131074868073e1307ae7bfc51f2c3631..ea335e9bd63c624310df2f092b13e30a9458bb93 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc
@@ -22,7 +22,7 @@
 USE_OP(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(leaky_relu);
 USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc
index c917630666b082ab7148550707f9f1f720aa25d3..2f3c3f3d06e327bc583c817bdfcc78345d8adff5 100644
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
@@ -67,4 +67,4 @@ TEST(NaiveExecutor, Basic) {
 } // namespace framework
 } // namespace paddle
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index b42f2da2a4d78b2913aedd01172771ce51926a2a..a0708f28e37ee2088d82f1b73b79f1452dc0f262 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -25,12 +25,12 @@ USE_OP(fill_constant);
 USE_OP(uniform_random);
 USE_OP(lookup_table);
 USE_OP(transpose2);
-USE_OP(reshape2);
+USE_OP_ITSELF(reshape2);
 USE_OP(split);
 USE_OP(slice);
 USE_OP(concat);
 USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(sigmoid);
 USE_OP(tanh);
 USE_OP(elementwise_mul);
@@ -39,9 +39,9 @@ USE_OP(reduce_mean);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
 USE_OP(reduce_mean_grad);
-USE_OP(reshape2_grad);
+USE_OP_ITSELF(reshape2_grad);
 USE_OP(softmax_with_cross_entropy_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
 USE_OP(matmul_grad);
 USE_OP(square);
 USE_OP(transpose2_grad);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7c13fa90f9bbc528393bc2607481bc43ca1b6397..7ab4e2acecfccd913343fc453338a26ddd9c92dd 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1336,8 +1336,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
 OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
     const ExecutionContext& ctx) const {
-  auto& dev_ctx = ctx.device_context();
-
   auto expected_kernel_key = this->GetExpectedKernelType(ctx);
   if (HasAttr("op_device")) {
     if (Attr("op_device") == "cpu") {
@@ -1354,12 +1352,20 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
   }
   // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
   // will be executed and a warning will be given at the same time.
+  expected_kernel_key.place_ = platform::CPUPlace();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (SupportGPU()) {
+    auto& dev_ctx = ctx.device_context();
     expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else if (SupportNPU()) {
+  }
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (SupportNPU()) {
+    auto& dev_ctx = ctx.device_context();
     expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else {
-    expected_kernel_key.place_ = platform::CPUPlace();
+  }
+#endif
+  if (platform::is_cpu_place(expected_kernel_key.place_)) {
     LOG_FIRST_N(WARNING, 1)
         << "Op(" << type_
         << ") has no CUDA implementation. It will be assigned to CPUPlace.";
@@ -1934,12 +1940,10 @@ Scope* OperatorWithKernel::PreparePtenData(
 
   for (size_t i = 0; i < input_defs.size(); ++i) {
     auto& in_def = input_defs.at(i);
-    auto it = ctx->inputs.find(input_names[i]);
-    if (it == ctx->inputs.end()) {
+    if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) {
       continue;
     }
-
-    auto& ins_vector = it->second;
+    auto& ins_vector = ctx->inputs.at(input_names[i]);
     auto& name_vec = name_map.at(input_names[i]);
     bool should_skip_input =
         no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0;
@@ -1950,7 +1954,6 @@ Scope* OperatorWithKernel::PreparePtenData(
       if (var == nullptr || !VarIsTensor(*var)) {
         continue;
       }
-
      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
 
       // When no_buffer_ins then checking of Tensor::holder_ is
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
index bca6a0a4cb8e0d61574f2b7be00e1f67b70ec035..79e6da987ef09db5ed43dfb8168dd13fa0cf885e 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
@@ -661,6 +661,6 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
 USE_PASS(build_cinn_pass);
 USE_OP(mul);
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(relu_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
index be51c7b783a797032c32b668cd83a4a0b83048b2..05cd9e8a2e8a0d9fb533d9b92b7e1c9d7742629b 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -302,4 +302,4 @@ USE_PASS(build_cinn_pass);
 USE_PASS(graph_viz_pass);
 USE_OP(mul);
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 465fc2fca138ef06f057c69eae2a3419136c1e72..9a4b197685ae152ab401fc56693a7a8363e2b75c 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -270,26 +270,26 @@ void BuildDygraphPtenKernelContext(
       kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
       auto end_idx = start_idx + 1;
       kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
-    } else {
-      auto ins_vector = it->second;
-      size_t end_idx = start_idx + ins_vector.size();
-
-      for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-        const pten::TensorBase* tensor_in = nullptr;
-        auto& var = ins_vector[offset]->Var();
-        if (var.template IsType()) {
-          tensor_in = &(var.template Get());
-        } else if (var.template IsType()) {
-          tensor_in = &(var.template Get());
-        } else {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Unsupported input `%s` type when call pt kernel.",
-              framework::ToTypeName(var.Type())));
-        }
-        kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
+      continue;
+    }
+    auto ins_vector = it->second;
+    size_t end_idx = start_idx + ins_vector.size();
+
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      const pten::TensorBase* tensor_in = nullptr;
+      auto& var = ins_vector[offset]->Var();
+      if (var.template IsType()) {
+        tensor_in = &(var.template Get());
+      } else if (var.template IsType()) {
+        tensor_in = &(var.template Get());
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported input `%s` type when call pt kernel.",
kernel.", + framework::ToTypeName(var.Type()))); } - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -468,8 +468,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel, for (size_t i = 0; i < input_names.size(); ++i) { auto& in_def = input_defs.at(i); - auto it = ins.find(input_names[i]); - if (it == ins.end()) { + if (ins.find(input_names[i]) == ins.end()) { continue; } auto& ins_vector = ins.at(input_names[i]); diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3a0bb7c52bfe2eabb9e769cfd6c8d436df4a87e3..c99dbf1cf6258dd3bb1fbdd753b37adfb2736f14 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -265,5 +265,5 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) { USE_OP(mul); USE_OP(mul_grad); -USE_OP(elementwise_add); -USE_OP(elementwise_add_grad); +USE_OP_ITSELF(elementwise_add); +USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index ff3331be56c3abe886496df95039c85073ed4777..e26cacb894836812a4f5e99ae469a95a959cf736 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -553,4 +553,4 @@ USE_OP(mul); USE_OP(mul_grad); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index 17adf957f64a76a010da6160479be2125d9deac9..d14317712b579b8f04889c3a18e4231d96513225 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -103,5 +103,5 @@ TEST(elementwise_op, plugin) { } // namespace inference } // namespace paddle -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP(elementwise_mul); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 849cdb715049ba235f737117e0769ec0a9105942..b4cd91ea8a4bce6f8a2bbeb01d15f03cb5053de7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" USE_OP(cinn_launch); -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 9d27d99b3ab35835330e629f21502d05d635103a..199e2b6bc7fc6cb3ec82c550058c8df14980fc01 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -32,7 +32,7 @@ limitations under the License. 
*/ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(elementwise_add); +USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index f462c2ea0720b600f238109704e9606a2f7d627c..53037c1fa653648044e2dc0981ec5c63351e7c15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -117,55 +117,6 @@ REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, ops::ElementwiseTripleGradOpInplaceInferer, ops::ElementwiseTripleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel>, - ops::ElementwiseAddGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel>, - ops::ElementwiseAddDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_add_triple_grad, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel, - ops::ElementwiseAddTripleGradKernel>, - ops::ElementwiseAddTripleGradKernel>); - // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. REGISTER_OPERATOR( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2326aa561eaa05986c6e58bc1f2f2c93334cf893..b66cd01349d1ecb76307a6d6a24cf9b08d69cfb4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -18,51 +18,6 @@ limitations under the License. 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-namespace paddle {
-namespace operators {} // namespace operators
-} // namespace paddle
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add, ops::ElementwiseAddKernel,
-    ops::ElementwiseAddKernel,
-    ops::ElementwiseAddKernel,
-    ops::ElementwiseAddKernel,
-    ops::ElementwiseAddKernel,
-    ops::ElementwiseAddKernel>,
-    ops::ElementwiseAddKernel>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel,
-    ops::ElementwiseAddGradKernel,
-    ops::ElementwiseAddGradKernel,
-    ops::ElementwiseAddGradKernel,
-    ops::ElementwiseAddGradKernel,
-    ops::ElementwiseAddGradKernel>,
-    ops::ElementwiseAddGradKernel>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel,
-    ops::ElementwiseAddDoubleGradKernel,
-    ops::ElementwiseAddDoubleGradKernel,
-    ops::ElementwiseAddDoubleGradKernel,
-    ops::ElementwiseAddDoubleGradKernel,
-    ops::ElementwiseAddDoubleGradKernel>,
-    ops::ElementwiseAddDoubleGradKernel>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_triple_grad,
-    ops::ElementwiseAddTripleGradKernel,
-    ops::ElementwiseAddTripleGradKernel,
-    ops::ElementwiseAddTripleGradKernel,
-    ops::ElementwiseAddTripleGradKernel,
-    ops::ElementwiseAddTripleGradKernel,
-    ops::ElementwiseAddTripleGradKernel>,
-    ops::ElementwiseAddTripleGradKernel>);
-
 REGISTER_OP_CUDA_KERNEL(
     grad_add, ops::ElementwiseAddKernel,
     ops::ElementwiseAddKernel,
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 73415d3fdb5c83cac1c0a8afb67548d7fa09b3c3..6f2a1fe87d70913f3699ead365e53923a7eaf83d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -43,73 +43,5 @@ class ElementwiseAddKernel : public framework::OpKernel {
   }
 };
 
-template
-class ElementwiseAddGradKernel : public ElemwiseGradKernel {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-    auto *x = ctx.Input("X");
-    auto *y = ctx.Input("Y");
-    auto *dout = ctx.Input(framework::GradVarName("Out"));
-    auto *dx = ctx.Output(framework::GradVarName("X"));
-    auto *dy = ctx.Output(framework::GradVarName("Y"));
-    const auto &dev_ctx = ctx.template device_context();
-    int axis = ctx.Attr("axis");
-    pten::AddGradKernel(
-        static_cast::TYPE &>(dev_ctx),
-        *x, *y, *dout, axis, dx, dy);
-  }
-};
-
-template
-class ElementwiseAddDoubleGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto *y = ctx.Input("Y");
-    auto *dout = ctx.Input("DOut");
-    auto *ddx = ctx.Input("DDX");
-    auto *ddy = ctx.Input("DDY");
-
-    auto *ddout = ctx.Output("DDOut");
-    const auto &dev_ctx = ctx.template device_context();
-    int axis = ctx.Attr("axis");
-    paddle::optional ddx_optional = paddle::none;
-    paddle::optional ddy_optional = paddle::none;
-    if (ddx != nullptr) {
-      ddx_optional = *ddx;
-    }
-    if (ddy != nullptr) {
-      ddy_optional = *ddy;
-    }
-    pten::AddDoubleGradKernel(
-        static_cast::TYPE &>(dev_ctx),
-        *y, ddx_optional, ddy_optional, *dout, axis, ddout);
-  }
-};
-
-template
-class ElementwiseAddTripleGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-    auto *ddx = ctx.Input("DDX");
-    auto *ddy = ctx.Input("DDY");
-    auto *d_ddout = ctx.Input("D_DDOut");
-    auto *d_ddx = ctx.Output("D_DDX");
-    auto *d_ddy = ctx.Output("D_DDY");
-
-    const auto &dev_ctx = ctx.template device_context();
-    int axis = ctx.Attr("axis");
-    pten::AddTripleGradKernel(
-        static_cast::TYPE &>(dev_ctx),
-        *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy);
-  }
-};
-
 } // namespace operators
 } // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index 63ec5bd4a2805e74b8a6552a53ac65fb55a0cdf5..4732762624a5f820698d228fb105529d845af049 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
 USE_OP(elementwise_sub);
 USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
index 12d82654362ac125502a1b4b73c34226647ec99e..7efa1d24dcf1fe3c62d3177321e4c5e98e8f267d 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
 #include "paddle/fluid/platform/place.h"
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
index 706475bc82fadef0eaf864d69fe3ceccb087d6f2..e1340de2096e08bcfc8d3010a87d56be869c749e 100644
--- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
+++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu
index 551d8ee6592dfcf39e15b5d5c3b40453847fb64d..94a6ba3139b1d700cfb7f3ce2cd02424da3f63bb 100644
--- a/paddle/fluid/operators/feed_forward_test.cu
+++ b/paddle/fluid/operators/feed_forward_test.cu
@@ -27,7 +27,7 @@ namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 
 USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 
 // get paddle matmul op results as baseline
 template
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
index 7251653793f89900efa5382db74201a1fc232574..7bd2eb5c5eba6733c2c52f745b28fa4230d12b64 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc
@@ -25,7 +25,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(elementwise_mul);
 USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN);
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
index 0612417c46ce30a73ce0cbc582be740023ff0ab6..6be0e703e564ceb397ea90c810f4018388b2838e 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
@@ -25,7 +25,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc
index 7c1cf9109c566625743f69de8cf3213855600c69..b96fcaa486cce8099cf1d03c7d948ea74c1923ad 100644
--- a/paddle/fluid/operators/op_debug_string_test.cc
+++ b/paddle/fluid/operators/op_debug_string_test.cc
@@ -18,7 +18,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 77c4a2005e3bf71c46b24e75d8c929507d2ca8a0..74095d2ce4e657f247f49818d9280295c68d5247 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -657,30 +657,6 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp,
                   ops::ReshapeDoubleGradInplaceInferer,
                   ops::ReshapeDoubleGradOpNoNeedBufferVarInferer);
 
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t,
-    ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel,
-    int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel,
-    paddle::platform::bfloat16, ops::ReshapeKernel,
-    paddle::platform::complex, ops::ReshapeKernel,
-    paddle::platform::complex, ops::ReshapeKernel);
-
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool,
-    ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel,
-    paddle::platform::complex, ops::ReshapeGradKernel,
-    paddle::platform::complex, ops::ReshapeGradKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex,
-    ops::ReshapeDoubleGradKernel);
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -695,45 +671,4 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 ops::ReshapeGradKernel, plat::float16,
                                 ops::ReshapeGradKernel, plat::bfloat16,
                                 ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                uint8_t, ops::ReshapeKernel, int64_t,
-                                ops::ReshapeKernel, plat::float16,
-                                ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                                plat::complex, ops::ReshapeKernel,
-                                plat::complex, ops::ReshapeKernel,
-                                plat::bfloat16, ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
-    ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex,
-    ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel,
-    plat::bfloat16, ops::ReshapeGradKernel);
-
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel,
-    plat::float16, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, plat::complex,
-    ops::ReshapeDoubleGradKernel, plat::complex,
-    ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
-#endif
-
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel, plat::float16,
-                               ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                               plat::complex, ops::ReshapeKernel,
-                               plat::complex, ops::ReshapeKernel);
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel, plat::float16,
-                               ops::ReshapeGradKernel, bool,
-                               ops::ReshapeGradKernel, plat::complex,
-                               ops::ReshapeGradKernel, plat::complex,
-                               ops::ReshapeGradKernel);
 #endif
diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc
index 60eeb66ae7d1eca6e093432bfdc4e5f12f47f2e9..29ba5bcc1b5bb27528ee01bbf85208978cb4f97c 100644
--- a/paddle/fluid/operators/test_common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(softmax);
 
 namespace paddle {
diff --git a/paddle/pten/kernels/selected_rows/scale_kernel.cc b/paddle/pten/kernels/selected_rows/scale_kernel.cc
index 09700d8afe0508e51cbdaff8404d97c4e25f5b9d..32f7a41a5b9688710450713a4b96c68906d26ad5 100644
--- a/paddle/pten/kernels/selected_rows/scale_kernel.cc
+++ b/paddle/pten/kernels/selected_rows/scale_kernel.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,10 +16,8 @@ limitations under the License. */
 
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-// See Note [ Why still include the fluid headers? ]
 #include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 namespace pten {
 
 template
diff --git a/paddle/pten/ops/compat/elementwise_sig.cc b/paddle/pten/ops/compat/elementwise_sig.cc
index c1941f6dde30baca60c3647ca0e2267c8a0d65f1..6541334ee27ec21d92ebcab67af1186bafadbfb2 100644
--- a/paddle/pten/ops/compat/elementwise_sig.cc
+++ b/paddle/pten/ops/compat/elementwise_sig.cc
@@ -75,6 +75,31 @@ KernelSignature ElementwiseAddGradOpArgumentMapping(
   return KernelSignature("unregistered", {}, {}, {});
 }
 
+KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
+}
+
+KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("add_triple_grad",
+                         {"DDX", "DDY", "D_DDOut"},
+                         {"axis"},
+                         {"D_DDX", "D_DDY"});
+}
+
+KernelSignature ElementwiseSubGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("subtract_grad",
+                           {"X", "Y", GradVarName("Out")},
+                           {"axis"},
+                           {GradVarName("X"), GradVarName("Y")});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
 } // namespace pten
 
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
@@ -82,6 +107,9 @@ PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad);
 
 PT_REGISTER_ARG_MAPPING_FN(elementwise_add,
                            pten::ElementwiseAddOpArgumentMapping);
@@ -93,3 +121,9 @@ PT_REGISTER_ARG_MAPPING_FN(elementwise_div,
                            pten::ElementwiseDivOpArgumentMapping);
 PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad,
                            pten::ElementwiseAddGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad,
+                           pten::ElementwiseAddDoubleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad,
+                           pten::ElementwiseAddTripleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad,
+                           pten::ElementwiseSubGradOpArgumentMapping);