Unverified · Commit c6478270, authored by YuanRisheng, committed by GitHub

[Pten]Remove reshape and elementwise_add's registry code in Fluid (#39317)

* remove reshape and elementwise_add registry

* delete code

* fix bugs when running CI unit tests

* remove log

* fix bugs when running unit tests

* fix bugs when running unit tests

* fix bugs when running CINN

* fix bugs when running ci-mac-python3

* fix compile bugs

* fix compile bugs

* fix compile bugs

* fix bugs when running on Kunlun

* fix bugs when compiling

* update code according to review comments
Parent 4157579e
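
Most of the per-file changes below are the same one-line swap: USE_OP(...) becomes USE_OP_ITSELF(...). A minimal sketch of the distinction, assuming the macro layout in paddle/fluid/framework/op_registry.h at the time of this commit (where USE_OP expands to USE_OP_ITSELF plus USE_OP_KERNEL):

    // Sketch, not verbatim source: the assumed relationship between the macros.
    // USE_OP(op_type) ≈
    //     USE_OP_ITSELF(op_type);  // force-link the operator definition
    //     USE_OP_KERNEL(op_type);  // force-link the fluid kernel registration
    //
    // This commit deletes the fluid kernel registrations for elementwise_add
    // and reshape2, so any test that said
    //     USE_OP(elementwise_add);
    // would no longer link; it now declares only the operator and lets the
    // kernel come from the pten registry:
    USE_OP_ITSELF(elementwise_add);
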
@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(fill_constant);
namespace paddle {
@@ -176,6 +176,6 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
}
USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
@@ -189,6 +189,6 @@ USE_OP_ITSELF(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
@@ -212,6 +212,6 @@ TEST(Benchmark, FluidMLPCPU) {
} // namespace paddle
USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
@@ -249,6 +249,6 @@ USE_OP_ITSELF(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
@@ -123,5 +123,5 @@ TEST(Generated, ElementwiseAdd) {
} // namespace egr
USE_OP(sigmoid);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(matmul_v2);
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/pten/core/kernel_factory.h"
namespace paddle {
namespace framework {
@@ -271,25 +272,41 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU(
  if (op_type == "c_sync_calc_stream" || op_type == "c_sync_comm_stream") {
    return true;
  }
-  auto &all_kernels = OperatorWithKernel::AllOpKernels();
-  auto it = all_kernels.find(op_type);
-  // skip op not has kernel
-  if (it != all_kernels.end()) {
-    bool support_cpu = false;
-    bool support_gpu = false;
-    for (auto &kernel_pair : it->second) {
-      if (platform::is_cpu_place(kernel_pair.first.place_)) {
-        support_cpu = true;
-      }
-      if (platform::is_gpu_place(kernel_pair.first.place_)) {
-        support_gpu = true;
-      }
-    }
-    VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu
-            << ", support GPU: " << support_gpu;
-    return support_cpu && support_gpu;
-  }
-  return true;
+  bool support_cpu = false;
+  bool support_gpu = false;
+  auto &kernel_factory = pten::KernelFactory::Instance();
+  auto kernel_key_map =
+      kernel_factory.SelectKernelMap(pten::TransToPtenKernelName(op_type));
+  bool has_op_kernel = kernel_key_map.size() > 0 ? true : false;
+  for (auto &kernel : kernel_key_map) {
+    if (platform::is_gpu_place(
+            pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_gpu = true;
+    } else if (platform::is_cpu_place(
+                   pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_cpu = true;
+    }
+  }
+  if (!support_cpu || !support_gpu) {
+    auto &all_kernels = OperatorWithKernel::AllOpKernels();
+    auto it = all_kernels.find(op_type);
+    // skip op not has kernel
+    if (it != all_kernels.end()) {
+      has_op_kernel = true;
+      for (auto &kernel_pair : it->second) {
+        if (platform::is_cpu_place(kernel_pair.first.place_)) {
+          support_cpu = true;
+        } else if (platform::is_gpu_place(kernel_pair.first.place_)) {
+          support_gpu = true;
+        }
+      }
+    }
+  }
+  VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu
+          << ", support GPU: " << support_gpu;
+  return has_op_kernel ? (support_cpu && support_gpu) : true;
}
bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck(
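
In the hunk above, the pass now asks the pten kernel registry first and falls back to the fluid kernel map only when pten does not already show both CPU and GPU support; an op with no kernels in either registry still passes the check. A condensed sketch of that control flow, with ScanPtenRegistry and ScanFluidRegistry as hypothetical helpers standing in for the two loops in the diff:

    // Condensed sketch of the lookup order implemented above (helper names
    // are hypothetical, not part of the actual patch).
    bool SupportsCPUAndGPU(const std::string &op_type) {
      bool cpu = false, gpu = false;
      bool has_kernel = ScanPtenRegistry(op_type, &cpu, &gpu);  // new path
      if (!cpu || !gpu) {
        has_kernel |= ScanFluidRegistry(op_type, &cpu, &gpu);   // fallback
      }
      return has_kernel ? (cpu && gpu) : true;  // kernel-less ops pass through
    }
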
@@ -26,7 +26,7 @@
USE_OP(mul);
USE_OP(cinn_launch);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
namespace paddle::framework {
using Name2VarInfoMap =
@@ -23,8 +23,8 @@
USE_OP_ITSELF(scale);
USE_OP(elementwise_mul);
-USE_OP(elementwise_add);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(elementwise_add_grad);
DECLARE_double(eager_delete_tensor_gb);
@@ -29,7 +29,7 @@ USE_OP(batch_norm);
USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
USE_OP(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(gelu);
USE_OP_DEVICE_KERNEL(gelu, MKLDNN);
@@ -22,7 +22,7 @@
USE_OP(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(leaky_relu);
USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
@@ -67,4 +67,4 @@ TEST(NaiveExecutor, Basic) {
} // namespace framework
} // namespace paddle
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -25,12 +25,12 @@ USE_OP(fill_constant);
USE_OP(uniform_random);
USE_OP(lookup_table);
USE_OP(transpose2);
-USE_OP(reshape2);
+USE_OP_ITSELF(reshape2);
USE_OP(split);
USE_OP(slice);
USE_OP(concat);
USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(sigmoid);
USE_OP(tanh);
USE_OP(elementwise_mul);
@@ -39,9 +39,9 @@ USE_OP(reduce_mean);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
USE_OP(reduce_mean_grad);
-USE_OP(reshape2_grad);
+USE_OP_ITSELF(reshape2_grad);
USE_OP(softmax_with_cross_entropy_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
USE_OP(matmul_grad);
USE_OP(square);
USE_OP(transpose2_grad);
@@ -1336,8 +1336,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
const ExecutionContext& ctx) const {
auto& dev_ctx = ctx.device_context();
auto expected_kernel_key = this->GetExpectedKernelType(ctx);
if (HasAttr("op_device")) {
if (Attr<std::string>("op_device") == "cpu") {
@@ -1354,12 +1352,20 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
  }
  // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
  // will be executed and a warning will be given at the same time.
+  expected_kernel_key.place_ = platform::CPUPlace();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (SupportGPU()) {
+    auto& dev_ctx = ctx.device_context();
    expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else if (SupportNPU()) {
+  }
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (SupportNPU()) {
+    auto& dev_ctx = ctx.device_context();
    expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else {
-    expected_kernel_key.place_ = platform::CPUPlace();
+  }
+#endif
+  if (platform::is_cpu_place(expected_kernel_key.place_)) {
    LOG_FIRST_N(WARNING, 1)
        << "Op(" << type_
        << ") has no CUDA implementation. It will be assigned to CPUPlace.";
@@ -1934,12 +1940,10 @@ Scope* OperatorWithKernel::PreparePtenData(
for (size_t i = 0; i < input_defs.size(); ++i) {
auto& in_def = input_defs.at(i);
auto it = ctx->inputs.find(input_names[i]);
if (it == ctx->inputs.end()) {
if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) {
continue;
}
auto& ins_vector = it->second;
auto& ins_vector = ctx->inputs.at(input_names[i]);
auto& name_vec = name_map.at(input_names[i]);
bool should_skip_input =
no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0;
@@ -1950,7 +1954,6 @@ Scope* OperatorWithKernel::PreparePtenData(
if (var == nullptr || !VarIsTensor(*var)) {
continue;
}
-      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
// When no_buffer_ins then checking of Tensor::holder_ is
@@ -661,6 +661,6 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
USE_PASS(build_cinn_pass);
USE_OP(mul);
USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(relu_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
@@ -302,4 +302,4 @@ USE_PASS(build_cinn_pass);
USE_PASS(graph_viz_pass);
USE_OP(mul);
USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -270,26 +270,26 @@ void BuildDygraphPtenKernelContext(
      kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
      auto end_idx = start_idx + 1;
      kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
-    } else {
-      auto ins_vector = it->second;
-      size_t end_idx = start_idx + ins_vector.size();
-      for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-        const pten::TensorBase* tensor_in = nullptr;
-        auto& var = ins_vector[offset]->Var();
-        if (var.template IsType<pten::DenseTensor>()) {
-          tensor_in = &(var.template Get<pten::DenseTensor>());
-        } else if (var.template IsType<pten::SelectedRows>()) {
-          tensor_in = &(var.template Get<pten::SelectedRows>());
-        } else {
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "Unsupported input `%s` type when call pt kernel.",
-              framework::ToTypeName(var.Type())));
-        }
-        kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
-      }
-      kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
+      continue;
    }
+    auto ins_vector = it->second;
+    size_t end_idx = start_idx + ins_vector.size();
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      const pten::TensorBase* tensor_in = nullptr;
+      auto& var = ins_vector[offset]->Var();
+      if (var.template IsType<pten::DenseTensor>()) {
+        tensor_in = &(var.template Get<pten::DenseTensor>());
+      } else if (var.template IsType<pten::SelectedRows>()) {
+        tensor_in = &(var.template Get<pten::SelectedRows>());
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported input `%s` type when call pt kernel.",
+            framework::ToTypeName(var.Type())));
+      }
+      kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
+    }
+    kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
  }
for (size_t i = 0; i < output_names.size(); ++i) {
@@ -468,8 +468,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel,
for (size_t i = 0; i < input_names.size(); ++i) {
auto& in_def = input_defs.at(i);
-    auto it = ins.find(input_names[i]);
-    if (it == ins.end()) {
+    if (ins.find(input_names[i]) == ins.end()) {
continue;
}
auto& ins_vector = ins.at(input_names[i]);
@@ -265,5 +265,5 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
USE_OP(mul);
USE_OP(mul_grad);
-USE_OP(elementwise_add);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(elementwise_add_grad);
@@ -553,4 +553,4 @@ USE_OP(mul);
USE_OP(mul_grad);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -103,5 +103,5 @@ TEST(elementwise_op, plugin) {
} // namespace inference
} // namespace paddle
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(elementwise_mul);
@@ -28,7 +28,7 @@ limitations under the License. */
#include "paddle/fluid/platform/init.h"
USE_OP(cinn_launch);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
namespace paddle {
namespace operators {
@@ -32,7 +32,7 @@ limitations under the License. */
namespace f = paddle::framework;
namespace p = paddle::platform;
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU);
@@ -117,55 +117,6 @@ REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad,
ops::ElementwiseTripleGradOpInplaceInferer,
ops::ElementwiseTripleGradNoBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_triple_grad,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
// A specialization elementwise_add operator, used in gradient accumulation with
// inplace addto.
REGISTER_OPERATOR(
@@ -18,51 +18,6 @@ limitations under the License. */
namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {} // namespace operators
} // namespace paddle
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::complex<float>>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<float>>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_triple_grad,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
@@ -43,73 +43,5 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
}
};
-template <typename DeviceContext, typename T>
-class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    pten::AddGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *x, *y, *dout, axis, dx, dy);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseAddDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *dout = ctx.Input<Tensor>("DOut");
-    auto *ddx = ctx.Input<Tensor>("DDX");
-    auto *ddy = ctx.Input<Tensor>("DDY");
-    auto *ddout = ctx.Output<Tensor>("DDOut");
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    paddle::optional<const pten::DenseTensor &> ddx_optional = paddle::none;
-    paddle::optional<const pten::DenseTensor &> ddy_optional = paddle::none;
-    if (ddx != nullptr) {
-      ddx_optional = *ddx;
-    }
-    if (ddy != nullptr) {
-      ddy_optional = *ddy;
-    }
-    pten::AddDoubleGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *y, ddx_optional, ddy_optional, *dout, axis, ddout);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ElementwiseAddTripleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto *ddx = ctx.Input<Tensor>("DDX");
-    auto *ddy = ctx.Input<Tensor>("DDY");
-    auto *d_ddout = ctx.Input<Tensor>("D_DDOut");
-    auto *d_ddx = ctx.Output<Tensor>("D_DDX");
-    auto *d_ddy = ctx.Output<Tensor>("D_DDY");
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    pten::AddTripleGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy);
-  }
-};
} // namespace operators
} // namespace paddle
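
The three kernel classes deleted above were thin wrappers: each unpacked the fluid ExecutionContext and forwarded into a pten kernel (pten::AddGradKernel, pten::AddDoubleGradKernel, pten::AddTripleGradKernel). With the fluid registrations gone, the same kernels are reached through the pten registry plus the argument-mapping functions added near the end of this diff. A minimal sketch of a direct call, using the pten::AddGradKernel signature visible in the deleted wrapper (CPU, float case; header path assumed):

    // Minimal sketch: what the removed ElementwiseAddGradKernel amounted to,
    // minus the fluid ExecutionContext plumbing.
    #include "paddle/pten/backends/cpu/cpu_context.h"

    void AddGradDirect(const pten::CPUContext &dev_ctx,
                       const pten::DenseTensor &x, const pten::DenseTensor &y,
                       const pten::DenseTensor &dout, int axis,
                       pten::DenseTensor *dx, pten::DenseTensor *dy) {
      pten::AddGradKernel<float>(dev_ctx, x, y, dout, axis, dx, dy);
    }
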
@@ -31,7 +31,7 @@ limitations under the License. */
namespace f = paddle::framework;
namespace p = paddle::platform;
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
#include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
namespace paddle {
namespace operators {
@@ -22,7 +22,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
namespace paddle {
namespace operators {
@@ -27,7 +27,7 @@ namespace framework = paddle::framework;
namespace platform = paddle::platform;
USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
// get paddle matmul op results as baseline
template <typename T>
@@ -25,7 +25,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(elementwise_mul);
USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN);
@@ -25,7 +25,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
@@ -18,7 +18,7 @@
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
namespace paddle {
namespace operators {
@@ -657,30 +657,6 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp,
ops::ReshapeDoubleGradInplaceInferer,
ops::ReshapeDoubleGradOpNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t,
-    ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel,
-    int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel,
-    paddle::platform::bfloat16, ops::ReshapeKernel,
-    paddle::platform::complex<float>, ops::ReshapeKernel,
-    paddle::platform::complex<double>, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool,
-    ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel,
-    paddle::platform::complex<float>, ops::ReshapeGradKernel,
-    paddle::platform::complex<double>, ops::ReshapeGradKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex<float>,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex<double>,
-    ops::ReshapeDoubleGradKernel);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -695,45 +671,4 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
ops::ReshapeGradKernel, plat::float16,
ops::ReshapeGradKernel, plat::bfloat16,
ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                uint8_t, ops::ReshapeKernel, int64_t,
-                                ops::ReshapeKernel, plat::float16,
-                                ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                                plat::complex<float>, ops::ReshapeKernel,
-                                plat::complex<double>, ops::ReshapeKernel,
-                                plat::bfloat16, ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
-    ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex<float>,
-    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel,
-    plat::bfloat16, ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel,
-    plat::float16, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, plat::complex<float>,
-    ops::ReshapeDoubleGradKernel, plat::complex<double>,
-    ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
#endif
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel, plat::float16,
-                               ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                               plat::complex<float>, ops::ReshapeKernel,
-                               plat::complex<double>, ops::ReshapeKernel);
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel, plat::float16,
-                               ops::ReshapeGradKernel, bool,
-                               ops::ReshapeGradKernel, plat::complex<float>,
-                               ops::ReshapeGradKernel, plat::complex<double>,
-                               ops::ReshapeGradKernel);
-#endif
@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
USE_OP(softmax);
namespace paddle {
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -16,10 +16,8 @@ limitations under the License. */
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/common/bfloat16.h"
#include "paddle/pten/core/kernel_registry.h"
namespace pten {
template <typename T, typename Context>
@@ -75,6 +75,31 @@ KernelSignature ElementwiseAddGradOpArgumentMapping(
return KernelSignature("unregistered", {}, {}, {});
}
+KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
+}
+
+KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("add_triple_grad",
+                         {"DDX", "DDY", "D_DDOut"},
+                         {"axis"},
+                         {"D_DDX", "D_DDY"});
+}
+
+KernelSignature ElementwiseSubGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("subtract_grad",
+                           {"X", "Y", GradVarName("Out")},
+                           {"axis"},
+                           {GradVarName("X"), GradVarName("Y")});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
} // namespace pten
PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
@@ -82,6 +107,9 @@ PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract);
PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply);
PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide);
PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad);
PT_REGISTER_ARG_MAPPING_FN(elementwise_add,
pten::ElementwiseAddOpArgumentMapping);
@@ -93,3 +121,9 @@ PT_REGISTER_ARG_MAPPING_FN(elementwise_div,
pten::ElementwiseDivOpArgumentMapping);
PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad,
pten::ElementwiseAddGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad,
+                           pten::ElementwiseAddDoubleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad,
+                           pten::ElementwiseAddTripleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad,
+                           pten::ElementwiseSubGradOpArgumentMapping);
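
Taken together, the additions above are what re-route the legacy grad ops onto the pten kernels: PT_REGISTER_BASE_KERNEL_NAME aliases the fluid op name to the pten kernel name, and PT_REGISTER_ARG_MAPPING_FN registers the function that turns the op's named inputs, attributes, and outputs into the kernel's positional signature. For illustration (hypothetical caller; the real dispatch lives inside the framework), the double-grad mapping resolves as:

    // Hypothetical illustration of the mapping registered above: for an op of
    // type elementwise_add_grad_grad the framework obtains
    //   KernelSignature("add_double_grad",
    //                   {"Y", "DDX", "DDY", "DOut"},  // inputs, in order
    //                   {"axis"},                     // attributes
    //                   {"DDOut"});                   // outputs
    // and then dispatches to the pten kernel registered as "add_double_grad".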