Unverified commit b532315d, authored by W wuyefeilin, committed by GitHub

[Phi] Move elementwise_floordiv and elementwise_pow to phi (#40993)

* mv floordiv to phi

* mv elementwise_pow to phi

* fix as review
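In outline, the commit deletes the fluid ElementwiseFloorDivKernel / ElementwisePowKernel classes and their REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL registrations, re-implements both ops (plus elementwise_pow_grad) as phi kernels registered through PD_REGISTER_KERNEL, and wires the old op names to the new kernels via argument-mapping functions. A minimal usage sketch of the functional wrappers this diff adds (the Example function, the tensor arguments, and the header path are illustrative assumptions, not part of the commit):

// Hypothetical caller of the phi helpers added below
// (FloorDivide / ElementwisePow); header path assumed.
#include "paddle/phi/kernels/elementwise_kernel.h"

void Example(const phi::CPUContext& dev_ctx,
             const phi::DenseTensor& ix, const phi::DenseTensor& iy,   // int64 tensors
             const phi::DenseTensor& fx, const phi::DenseTensor& fy) { // float tensors
  // Each helper runs ElementwiseInferMeta on the output and then
  // dispatches to the corresponding *_raw kernel with axis = -1.
  phi::DenseTensor fd = phi::FloorDivide<int64_t>(dev_ctx, ix, iy);   // ix // iy
  phi::DenseTensor pw = phi::ElementwisePow<float>(dev_ctx, fx, fy);  // fx ** fy
  (void)fd;
  (void)pw;
}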
Parent 59765362
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
@@ -63,12 +61,6 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
ops::ElementwiseFloorDivOpMaker);
REGISTER_OP_CPU_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>);
REGISTER_OP_VERSION(elementwise_floordiv)
.AddCheckpoint(
R"ROC(Register elementwise_floordiv for adding the attribute of Scale_y)ROC",
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseFloorDivKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
T>(
cuda_ctx, ins, &outs, axis, FloorDivFunctor<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
}
template <typename DeviceContext, typename T>
class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<framework::LoDTensor>("X");
auto *y = ctx.Input<framework::LoDTensor>("Y");
auto *z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
// dtype of x and y is int64 or int32
elementwise_floor_div<DeviceContext, T>(ctx, x, y, z);
}
};
} // namespace operators
} // namespace paddle
@@ -49,23 +49,6 @@ using DivFunctor = phi::funcs::DivideFunctor<T>;
template <typename T>
using InverseDivFunctor = phi::funcs::InverseDivideFunctor<T>;
// Floor Divide
template <typename T>
struct FloorDivFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
return static_cast<T>(std::trunc(a / b));
}
};
template <typename T>
struct InverseFloorDivFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO);
return static_cast<T>(std::trunc(b / a));
}
};
#undef DIV_ERROR_INFO
// Maximum
......
@@ -9,8 +9,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
@@ -70,19 +68,6 @@ REGISTER_OPERATOR(elementwise_pow, ops::ElementwiseOp,
ops::ElementwisePowOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(elementwise_pow_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
elementwise_pow_grad,
ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwisePowGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_VERSION(elementwise_pow)
.AddCheckpoint(
R"ROC(Register elementwise_pow for adding the attribute of Scale_y)ROC",
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
namespace ops = paddle::operators;
namespace paddle {
namespace operators {
template <typename T>
class ElementwisePowKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
T>(cuda_ctx, ins, &outs,
axis, PowFunctor<T>());
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_pow_grad,
ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwisePowGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <type_traits>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
struct PowFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
// TODO(wujionghao): A potential speed improvement is supporting different
// types in C++.
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
// On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
// it may return a float number like 2.99..., which is floored to 2
// when cast to int by default, which is wrong.
// Use llrint to round it to the nearest integer, which is 3.
if (std::is_integral<T>::value) {
return std::llrint(
std::pow(static_cast<double>(a), static_cast<double>(b)));
}
#endif
return std::pow(a, b);
}
};
template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::LoDTensor;
auto* x = ctx.Input<Tensor>("X");
PADDLE_ENFORCE_EQ(x != nullptr, true,
platform::errors::NotFound(
"Cannot get input Variable X, Variable name = %s",
ctx.InputName("X")));
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
PowFunctor<T>(), z);
}
};
template <typename T>
struct PowGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
if (std::is_integral<T>::value) {
return dout * y *
std::pow(static_cast<double>(x), static_cast<double>(y - 1));
}
#endif
return dout * y * std::pow(x, y - 1);
}
};
template <typename T, typename Enable = void>
struct PowGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
if (std::is_integral<T>::value) {
return dout * std::log(static_cast<double>(x)) *
std::pow(static_cast<double>(x), static_cast<double>(y));
}
#endif
return dout * std::log(x) * std::pow(x, y);
}
};
template <typename DeviceContext, typename T>
class ElementwisePowGradKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* out = dout;
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, PowGradDX<T>, PowGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, PowGradDX<T>(), PowGradDY<T>());
}
};
} // namespace operators
} // namespace paddle
@@ -16,7 +16,6 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle { namespace paddle {
......
@@ -323,3 +323,11 @@ PD_REGISTER_KERNEL(minimum_grad,
int,
int64_t,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(elementwise_pow_grad,
CPU,
ALL_LAYOUT,
phi::ElementwisePowGradKernel,
float,
double,
int,
int64_t) {}
@@ -113,6 +113,36 @@ void ModuloRawKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
funcs::ElementwiseCompute<funcs::FloorDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::FloorDivideFunctor<T>(), out);
} else {
funcs::ElementwiseCompute<funcs::InverseFloorDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseFloorDivideFunctor<T>(), out);
}
}
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out) {
// allocate memory for out
dev_ctx.template Alloc<T>(out);
funcs::ElementwiseCompute<funcs::ElementwisePowFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::ElementwisePowFunctor<T>(), out);
}
// Create the definition of Add
DEFINE_CPU_ELEMENTWISE_OP(Add)
@@ -207,3 +237,17 @@ PD_REGISTER_KERNEL(modulo_raw,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(floor_divide_raw,
CPU,
ALL_LAYOUT,
phi::FloorDivideRawKernel,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow_raw,
CPU,
ALL_LAYOUT,
phi::ElementwisePowRawKernel,
float,
double,
int,
int64_t) {}
@@ -159,4 +159,13 @@ void MinimumGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* dx,
DenseTensor* dy);
template <typename T, typename Context>
void ElementwisePowGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy);
} // namespace phi
@@ -81,6 +81,25 @@ void ModuloKernel(const Context& dev_ctx,
int axis = -1;
ModuloRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
FloorDivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out);
}
} // namespace phi
using complex64 = ::phi::dtype::complex<float>;
@@ -151,6 +170,16 @@ PD_REGISTER_KERNEL(minimum,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(
modulo, CPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
CPU,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
double,
int,
int64_t) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -226,4 +255,14 @@ PD_REGISTER_KERNEL(minimum,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(
modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
GPU,
ALL_LAYOUT,
phi::ElementwisePowKernel,
float,
double,
int,
int64_t) {}
#endif
@@ -124,6 +124,32 @@ void ModuloKernel(const Context& dev_ctx,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void FloorDivideRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void FloorDivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
void ElementwisePowRawKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
DenseTensor* out);
template <typename T, typename Context>
void ElementwisePowKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Add(const Context& dev_ctx,
const DenseTensor& x,
@@ -200,4 +226,27 @@ DenseTensor Modulo(const Context& dev_ctx,
ModuloKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor FloorDivide(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
FloorDivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
template <typename T, typename Context>
DenseTensor ElementwisePow(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
ElementwiseInferMeta(x, y, &meta_out);
ElementwisePowKernel<T, Context>(dev_ctx, x, y, &dense_out);
return dense_out;
}
} // namespace phi
@@ -538,5 +538,40 @@ struct InverseModuloFunctor<
return res;
}
};
template <typename T>
struct FloorDivideFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
return static_cast<T>(std::trunc(a / b));
}
};
template <typename T>
struct InverseFloorDivideFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO);
return static_cast<T>(std::trunc(b / a));
}
};
template <typename T>
struct ElementwisePowFunctor {
inline HOSTDEVICE T operator()(const T a, const T b) const {
// TODO(wujionghao): A potential speed improvement is supporting different
// types in C++.
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
// On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
// it may return a float number like 2.99..., which is floored to 2
// when cast to int by default, which is wrong.
// Use llrint to round it to the nearest integer, which is 3.
if (std::is_integral<T>::value) {
return std::llrint(
std::pow(static_cast<double>(a), static_cast<double>(b)));
}
#endif
return std::pow(a, b);
}
};
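A small host-side illustration of the rounding hazard the functor comment above guards against (a standalone demo, not part of the commit; on a CPU build std::pow(3, 1) is typically exact, so the near-integer value below is an assumption standing in for the device pow result):

#include <cmath>
#include <cstdio>

int main() {
  // A device pow(float, float) may land just below the exact result,
  // e.g. 2.9999997, which a plain integer cast truncates to 2.
  double approx = 2.9999997;                 // assumed device pow result
  int truncated = static_cast<int>(approx);  // 2: the wrong answer
  long long rounded = std::llrint(approx);   // 3: what the functor does
  std::printf("truncated=%d rounded=%lld\n", truncated, rounded);
  return 0;
}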
} // namespace funcs
} // namespace phi
@@ -382,3 +382,11 @@ PD_REGISTER_KERNEL(minimum_grad,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(elementwise_pow_grad,
GPU,
ALL_LAYOUT,
phi::ElementwisePowGradKernel,
float,
double,
int,
int64_t) {}
@@ -55,6 +55,10 @@ DEFINE_CUDA_ELEMENTWISE_OP(Maximum)
DEFINE_CUDA_ELEMENTWISE_OP(Minimum)
// Create the definition of Modulo
DEFINE_CUDA_ELEMENTWISE_OP(Modulo)
// Create the definition of FloorDivide
DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide)
// Create the definition of Pow
DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow)
} // namespace phi
@@ -148,3 +152,17 @@ PD_REGISTER_KERNEL(modulo_raw,
double,
int,
int64_t) {}
PD_REGISTER_KERNEL(floor_divide_raw,
GPU,
ALL_LAYOUT,
phi::FloorDivideRawKernel,
int,
int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow_raw,
GPU,
ALL_LAYOUT,
phi::ElementwisePowRawKernel,
float,
double,
int,
int64_t) {}
@@ -666,4 +666,44 @@ struct MinGradDy {
return dout * static_cast<T>(x >= y);
}
};
template <typename T>
struct PowGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
if (std::is_integral<T>::value) {
return dout * y *
std::pow(static_cast<double>(x), static_cast<double>(y - 1));
}
#endif
return dout * y * std::pow(x, y - 1);
}
};
template <typename T, typename Enable = void>
struct PowGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
if (std::is_integral<T>::value) {
return dout * std::log(static_cast<double>(x)) *
std::pow(static_cast<double>(x), static_cast<double>(y));
}
#endif
return dout * std::log(x) * std::pow(x, y);
}
};
template <typename T, typename Context>
void ElementwisePowGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
int axis,
DenseTensor* dx,
DenseTensor* dy) {
funcs::ElementwiseGradPreProcess(dout, dx);
phi::funcs::ElemwiseGradCompute<Context, T, PowGradDX<T>, PowGradDY<T>>(
dev_ctx, x, y, dout, dout, axis, dx, dy, PowGradDX<T>(), PowGradDY<T>());
}
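For reference, PowGradDX and PowGradDY above apply the chain rule to the standard analytic derivatives of z = x^y: dz/dx = y * x^(y-1) and dz/dy = x^y * ln(x), each scaled by the incoming gradient dout. The integral-T branches evaluate the same expressions in double precision to sidestep the device pow rounding issue described in the functor comment.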
} // namespace phi
@@ -82,6 +82,24 @@ KernelSignature ElementwiseModOpArgumentMapping(
return KernelSignature("modulo_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwiseFloorDivOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (axis == -1) {
return KernelSignature("floor_divide", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("floor_divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
KernelSignature ElementwisePowOpArgumentMapping(
const ArgumentMappingContext& ctx) {
int axis = paddle::any_cast<int>(ctx.Attr("axis"));
if (axis == -1) {
return KernelSignature("elementwise_pow", {"X", "Y"}, {}, {"Out"});
}
return KernelSignature("elementwise_pow_raw", {"X", "Y"}, {"axis"}, {"Out"});
}
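Both mappings follow the raw/non-raw convention used by the other elementwise ops in this file: with the default axis of -1, the attribute is dropped and the plain kernel is chosen; any explicit axis routes to the *_raw variant that still takes it. A schematic of that selection (ChooseKernel is a hypothetical illustration, not Paddle API):

#include <string>

// Hypothetical sketch of the axis-based signature selection above.
std::string ChooseKernel(const std::string& base, int axis) {
  // axis == -1 is the default broadcast behavior, so the attribute can
  // be omitted and the non-raw kernel (which fixes axis = -1) used.
  return axis == -1 ? base : base + "_raw";
}

// ChooseKernel("floor_divide", -1)   -> "floor_divide"
// ChooseKernel("elementwise_pow", 0) -> "elementwise_pow_raw"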
KernelSignature ElementwiseAddGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("add_grad",
@@ -200,6 +218,13 @@ KernelSignature ElementwiseMinGradOpArgumentMapping(
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
KernelSignature ElementwisePowGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("elementwise_pow_grad",
{"X", "Y", GradVarName("Out")},
{"axis"},
{GradVarName("X"), GradVarName("Y")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
@@ -209,6 +234,7 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_max, maximum);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_min, minimum);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mod, modulo);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_floordiv, floor_divide);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
@@ -240,6 +266,10 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_min,
phi::ElementwiseMinOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_mod,
phi::ElementwiseModOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_floordiv,
phi::ElementwiseFloorDivOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_pow,
phi::ElementwisePowOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad,
phi::ElementwiseAddGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad,
@@ -272,3 +302,5 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad,
phi::ElementwiseMaxGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad,
phi::ElementwiseMinGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad,
phi::ElementwisePowGradOpArgumentMapping);