Unverified · Commit 09258040, authored by sneaxiy, committed by GitHub

Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi library (#40043)

* move gather.h gather.cu.h scatter.h scatter.cu.h to phi library

* fix CI

* fix rocm ci
Parent 2e6548a9
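The change repeated throughout the diff below is mechanical: drop the gather/scatter includes under paddle/fluid/operators, include the corresponding headers under paddle/phi/kernels/funcs, and qualify the calls with phi::funcs::, passing a concrete phi device context instead of the fluid ExecutionContext. A minimal sketch of the call-site pattern follows; it is not part of the PR, it assumes it is built inside the Paddle source tree, and GatherCpuExample with its parameter names is illustrative only.

// Sketch only: the call-site migration performed by this PR, CPU path.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/kernels/funcs/gather.h"  // was: paddle/fluid/operators/gather.h

namespace paddle {
namespace operators {

template <typename T>
void GatherCpuExample(const phi::CPUContext &dev_ctx,
                      const framework::Tensor &src,
                      const framework::Tensor &index,
                      framework::Tensor *output) {
  // Before the move this was an unqualified call that took the fluid device
  // context, e.g. CPUGather<T>(ctx.device_context(), src, index, output);
  // After the move the helper lives in phi::funcs and takes a phi context
  // (obtained via ctx.template device_context<phi::CPUContext>() in a kernel).
  phi::funcs::CPUGather<T>(dev_ctx, src, index, output);
}

}  // namespace operators
}  // namespace paddle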
@@ -23,7 +23,6 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -23,11 +23,11 @@ namespace cub = hipcub;
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 namespace paddle {
 namespace operators {
@@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
 Tensor sorted_batch_id;
 sorted_batch_id.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
-GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
-GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t,
+phi::funcs::GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
+phi::funcs::GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t,
 &sorted_batch_id);
 Tensor batch_index_t;
 int* batch_idx_in =
@@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
 sizeof(int) * 8, dev_ctx.stream());
-GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
+phi::funcs::GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
 Tensor length_lod;
 int* length_lod_data =
...
@@ -21,7 +21,6 @@ limitations under the License.*/
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 auto multi_layer_scores =
 context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
-auto multi_rois_num = context.MultiInput<Tensor>("MultiLevelRoIsNum");
+auto multi_rois_num =
+context.MultiInput<framework::Tensor>("MultiLevelRoIsNum");
 int num_size = multi_rois_num.size();
 auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
@@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 }
 num_per_batch.emplace_back(post_nms_topN - pre_idx);
 if (context.HasOutput("RoisNum")) {
-auto* rois_num = context.Output<Tensor>("RoisNum");
+auto* rois_num = context.Output<framework::Tensor>("RoisNum");
 int* rois_num_data =
 rois_num->mutable_data<int>({batch_size}, context.GetPlace());
 for (int i = 0; i < batch_size; i++) {
...
@@ -24,9 +24,9 @@ namespace cub = hipcub;
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 start = end;
 multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
 dev_ctx.GetPlace());
-GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
+phi::funcs::GPUGather<T>(dev_ctx, *fpn_rois, sub_idx,
+multi_fpn_rois[i]);
 } else {
 multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
 dev_ctx.GetPlace());
...
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -28,10 +27,11 @@ namespace operators {
 const int kBoxDim = 4;
-inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
+inline std::vector<size_t> GetLodFromRoisNum(
+const framework::Tensor* rois_num) {
 std::vector<size_t> rois_lod;
 auto* rois_num_data = rois_num->data<int>();
-Tensor cpu_tensor;
+framework::Tensor cpu_tensor;
 if (platform::is_gpu_place(rois_num->place())) {
 paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(),
 &cpu_tensor);
@@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 std::vector<size_t> fpn_rois_lod;
 int fpn_rois_num;
 if (context.HasInput("RoisNum")) {
-auto* rois_num = context.Input<Tensor>("RoisNum");
+auto* rois_num = context.Input<framework::Tensor>("RoisNum");
 fpn_rois_lod = GetLodFromRoisNum(rois_num);
 } else {
 fpn_rois_lod = fpn_rois->lod().back();
@@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 std::vector<int> num_rois_level(num_level, 0);
 std::vector<int> num_rois_level_integral(num_level + 1, 0);
 for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-Tensor fpn_rois_slice =
+auto fpn_rois_slice =
 fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
 const T* rois_data = fpn_rois_slice.data<T>();
 for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
@@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 std::vector<int> restore_index_inter(fpn_rois_num, -1);
 // distribute the rois into different fpn level by target level
 for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-Tensor fpn_rois_slice =
+auto fpn_rois_slice =
 fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
 const T* rois_data = fpn_rois_slice.data<T>();
 size_t cur_offset = fpn_rois_lod[i];
@@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 for (int i = 0; i < fpn_rois_num; ++i) {
 restore_index_data[restore_index_inter[i]] = i;
 }
-auto multi_rois_num = context.MultiOutput<Tensor>("MultiLevelRoIsNum");
+auto multi_rois_num =
+context.MultiOutput<framework::Tensor>("MultiLevelRoIsNum");
 if (multi_rois_num.size() > 0) {
 int batch_size = fpn_rois_lod.size() - 1;
 for (int i = 0; i < num_level; ++i) {
...
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/mask_util.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 Tensor fg_boxes, bg_boxes, fg_labels, bg_labels;
 fg_boxes.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
-CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
+phi::funcs::CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
 bg_boxes.mutable_data<T>({bg_num, kBoxDim}, context.GetPlace());
-CPUGather<T>(context, boxes, bg_inds_t, &bg_boxes);
+phi::funcs::CPUGather<T>(context, boxes, bg_inds_t, &bg_boxes);
 Concat<T>(context, fg_boxes, bg_boxes, sampled_boxes);
-CPUGather<T>(context, gt_boxes, gt_box_inds_t, sampled_gts);
+phi::funcs::CPUGather<T>(context, gt_boxes, gt_box_inds_t, sampled_gts);
 fg_labels.mutable_data<int>({fg_num}, context.GetPlace());
-CPUGather<int>(context, gt_classes, gt_label_inds_t, &fg_labels);
+phi::funcs::CPUGather<int>(context, gt_classes, gt_label_inds_t, &fg_labels);
 bg_labels.mutable_data<int>({bg_num}, context.GetPlace());
 phi::funcs::set_constant(context, &bg_labels, 0);
 Concat<int>(context, fg_labels, bg_labels, sampled_labels);
 Tensor fg_max_overlap, bg_max_overlap;
 fg_max_overlap.mutable_data<T>({fg_num}, context.GetPlace());
-CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
+phi::funcs::CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
 bg_max_overlap.mutable_data<T>({bg_num}, context.GetPlace());
-CPUGather<T>(context, max_overlap, bg_inds_t, &bg_max_overlap);
+phi::funcs::CPUGather<T>(context, max_overlap, bg_inds_t, &bg_max_overlap);
 Concat<T>(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap);
 }
@@ -334,7 +334,7 @@ std::vector<Tensor> SampleRoisForOneImage(
 } else {
 proposals_num = keep.numel();
 roi_filter.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
-CPUGather<T>(context, rpn_rois, keep, &roi_filter);
+phi::funcs::CPUGather<T>(context, rpn_rois, keep, &roi_filter);
 }
 T* roi_filter_dt = roi_filter.data<T>();
 memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T));
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
-#include "paddle/fluid/operators/gather.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
 var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
-CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
-CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
-CPUGather<T>(ctx, variances, index_t, &var_sel);
+phi::funcs::CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
+phi::funcs::CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
+phi::funcs::CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
+phi::funcs::CPUGather<T>(ctx, variances, index_t, &var_sel);
 Tensor proposals;
 proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
@@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 Tensor scores_filter;
 bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
 scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
-CPUGather<T>(ctx, proposals, keep, &bbox_sel);
-CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+phi::funcs::CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+phi::funcs::CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
 if (nms_thresh <= 0) {
 return std::make_pair(bbox_sel, scores_filter);
 }
@@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
 scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
-CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+phi::funcs::CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+phi::funcs::CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
 return std::make_pair(proposals, scores_sel);
 }
...
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -85,8 +86,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 }
 proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
 scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
-GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
-GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+phi::funcs::GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+phi::funcs::GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
 if (nms_thresh <= 0) {
 return std::make_pair(proposals_filter, scores_filter);
@@ -102,8 +103,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 Tensor scores_nms, proposals_nms;
 proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
 scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
-GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+phi::funcs::GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+phi::funcs::GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
 return std::make_pair(proposals_nms, scores_nms);
 }
...
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
-#include "paddle/fluid/operators/gather.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
 anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
 var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
-CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
-CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
-CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
-CPUGather<T>(ctx, variances, index_t, &var_sel);
+phi::funcs::CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
+phi::funcs::CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
+phi::funcs::CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
+phi::funcs::CPUGather<T>(ctx, variances, index_t, &var_sel);
 Tensor proposals;
 proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
@@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
 Tensor scores_filter;
 bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
 scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
-CPUGather<T>(ctx, proposals, keep, &bbox_sel);
-CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+phi::funcs::CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+phi::funcs::CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
 if (nms_thresh <= 0) {
 return std::make_pair(bbox_sel, scores_filter);
 }
@@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
 proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
 scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
-CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+phi::funcs::CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+phi::funcs::CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
 return std::make_pair(proposals, scores_sel);
 }
...
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -86,8 +87,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 }
 proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
 scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
-GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
-GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+phi::funcs::GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+phi::funcs::GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
 if (nms_thresh <= 0) {
 return std::make_pair(proposals_filter, scores_filter);
@@ -104,8 +105,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 Tensor scores_nms, proposals_nms;
 proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
 scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
-GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+phi::funcs::GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+phi::funcs::GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
 return std::make_pair(proposals_nms, scores_nms);
 }
...
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_nd_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+template <typename T>
 class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
@@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
 output->mutable_data<T>(ctx.GetPlace());
 if (x->numel() == 0) return;
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
-bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-index_type == framework::proto::VarType::INT64;
-PADDLE_ENFORCE_EQ(index_type_match, true,
-platform::errors::InvalidArgument(
-"Index holds the wrong type, it holds [%s], but "
-"desires to be [%s] or [%s].",
-paddle::framework::DataTypeToString(index_type),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT32),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT64)));
-if (index_type == framework::proto::VarType::INT32) {
-GPUGatherNd<DeviceContext, T, int>(ctx, *x, *index, output);
-} else if (index_type == framework::proto::VarType::INT64) {
-GPUGatherNd<DeviceContext, T, int64_t>(ctx, *x, *index, output);
+const auto &index_type = index->dtype();
+bool index_type_match = index_type == phi::DataType::INT32 ||
+index_type == phi::DataType::INT64;
+PADDLE_ENFORCE_EQ(
+index_type_match, true,
+platform::errors::InvalidArgument(
+"Index holds the wrong type, it holds [%s], but "
+"desires to be [%s] or [%s].",
+index_type, phi::DataType::INT32, phi::DataType::INT64));
+auto &dev_ctx = ctx.cuda_device_context();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GPUGatherNd<T, int>(dev_ctx, *x, *index, output);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
 }
 }
 };
-template <typename DeviceContext, typename T>
+template <typename T>
 class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext &ctx) const override {
@@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
 dxt.device(place) = dxt.constant(static_cast<T>(0));
 if (dO->numel() == 0) return;
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
-bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-index_type == framework::proto::VarType::INT64;
-PADDLE_ENFORCE_EQ(index_type_match, true,
-platform::errors::InvalidArgument(
-"Index holds the wrong type, it holds [%s],"
-"but desires to be [%s] or [%s].",
-paddle::framework::DataTypeToString(index_type),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT32),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT64)));
-if (index_type == framework::proto::VarType::INT32) {
-GPUScatterNdAdd<DeviceContext, T, int>(ctx, *dO, *index, dX);
-} else if (index_type == framework::proto::VarType::INT64) {
-GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *dO, *index, dX);
+const auto &index_type = index->dtype();
+bool index_type_match = index_type == phi::DataType::INT32 ||
+index_type == phi::DataType::INT64;
+PADDLE_ENFORCE_EQ(
+index_type_match, true,
+platform::errors::InvalidArgument(
+"Index holds the wrong type, it holds [%s],"
+"but desires to be [%s] or [%s].",
+index_type, phi::DataType::INT32, phi::DataType::INT64));
+auto &dev_ctx = ctx.cuda_device_context();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GPUScatterNdAdd<T, int>(dev_ctx, *dO, *index, dX);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
 }
 }
 };
@@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<CUDA, float>,
-ops::GatherNdOpCUDAKernel<CUDA, double>,
-ops::GatherNdOpCUDAKernel<CUDA, int64_t>,
-ops::GatherNdOpCUDAKernel<CUDA, int>,
-ops::GatherNdOpCUDAKernel<CUDA, int16_t>,
-ops::GatherNdOpCUDAKernel<CUDA, bool>,
-ops::GatherNdOpCUDAKernel<CUDA, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(gather_nd_grad,
-ops::GatherNdGradOpCUDAKernel<CUDA, float>,
-ops::GatherNdGradOpCUDAKernel<CUDA, double>,
-ops::GatherNdGradOpCUDAKernel<CUDA, int64_t>,
-ops::GatherNdGradOpCUDAKernel<CUDA, int>,
-ops::GatherNdGradOpCUDAKernel<CUDA, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<float>,
+ops::GatherNdOpCUDAKernel<double>,
+ops::GatherNdOpCUDAKernel<int64_t>,
+ops::GatherNdOpCUDAKernel<int>,
+ops::GatherNdOpCUDAKernel<int16_t>,
+ops::GatherNdOpCUDAKernel<bool>,
+ops::GatherNdOpCUDAKernel<plat::float16>);
+REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel<float>,
+ops::GatherNdGradOpCUDAKernel<double>,
+ops::GatherNdGradOpCUDAKernel<int64_t>,
+ops::GatherNdGradOpCUDAKernel<int>,
+ops::GatherNdGradOpCUDAKernel<plat::float16>);
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 namespace paddle {
 namespace operators {
@@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel<T> {
 output->mutable_data<T>(ctx.GetPlace());
 if (x->numel() == 0) return;
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
-bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-index_type == framework::proto::VarType::INT64;
-PADDLE_ENFORCE_EQ(index_type_match, true,
-platform::errors::InvalidArgument(
-"Index holds the wrong type, it holds [%s],"
-"but desires to be [%s] or [%s]",
-paddle::framework::DataTypeToString(index_type),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT32),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT64)));
-if (index_type == framework::proto::VarType::INT32) {
-CPUGatherNd<T, int>(ctx.device_context(), *x, *index, output);
-} else if (index_type == framework::proto::VarType::INT64) {
-CPUGatherNd<T, int64_t>(ctx.device_context(), *x, *index, output);
+auto index_type = index->dtype();
+bool index_type_match = index_type == phi::DataType::INT32 ||
+index_type == phi::DataType::INT64;
+PADDLE_ENFORCE_EQ(
+index_type_match, true,
+platform::errors::InvalidArgument(
+"Index holds the wrong type, it holds [%s],"
+"but desires to be [%s] or [%s]",
+index_type, phi::DataType::INT32, phi::DataType::INT64));
+auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::CPUGatherNd<T, int>(dev_ctx, *x, *index, output);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
 }
 }
 };
@@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel<T> {
 PADDLE_ENFORCE_EQ(
 platform::is_cpu_place(ctx.GetPlace()), true,
 platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
+
 auto *index = ctx.Input<Tensor>("Index");
 auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
 auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel<T> {
 dxt.device(place) = dxt.constant(static_cast<T>(0));
 if (dO->numel() == 0) return;
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
-bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-index_type == framework::proto::VarType::INT64;
-PADDLE_ENFORCE_EQ(index_type_match, true,
-platform::errors::InvalidArgument(
-"Index holds the wrong type, it holds [%s],"
-"but desires to be [%s] or [%s]",
-paddle::framework::DataTypeToString(index_type),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT32),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT64)));
-if (index_type == framework::proto::VarType::INT32) {
-ScatterNdAdd<T, int32_t>(ctx, *dO, *index, dX);
-} else if (index_type == framework::proto::VarType::INT64) {
-ScatterNdAdd<T, int64_t>(ctx, *dO, *index, dX);
+auto index_type = index->dtype();
+bool index_type_match = index_type == phi::DataType::INT32 ||
+index_type == phi::DataType::INT64;
+PADDLE_ENFORCE_EQ(
+index_type_match, true,
+platform::errors::InvalidArgument(
+"Index holds the wrong type, it holds [%s],"
+"but desires to be [%s] or [%s]",
+index_type, phi::DataType::INT32, phi::DataType::INT64));
+auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
 }
 }
 };
...
@@ -14,9 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 namespace paddle {
 namespace operators {
@@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
 }
 const auto &place = ctx.GetPlace();
 const auto &index_type = framework::TransToProtoVarType(index->dtype());
+const auto &dev_ctx = ctx.cuda_device_context();
 if (axis != 0) {
 if (index_type == framework::proto::VarType::INT32) {
-GatherV2CUDAFunction<T, int32_t>(x, index, axis, output, place, ctx);
+phi::funcs::GatherV2CUDAFunction<T, int32_t>(x, index, axis, output,
+dev_ctx);
 } else if (index_type == framework::proto::VarType::INT64) {
-GatherV2CUDAFunction<T, int64_t>(x, index, axis, output, place, ctx);
+phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
+dev_ctx);
 }
 return;
 }
@@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
 output->mutable_data<T>(ctx.GetPlace());
 if (x->numel() == 0) return;
 if (index_type == framework::proto::VarType::INT32) {
-GPUGather<T, int>(ctx.device_context(), *x, *index, output);
+phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
 } else if (index_type == framework::proto::VarType::INT64) {
-GPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
+phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
 }
 }
 };
@@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }
 }
+const auto &dev_ctx = ctx.cuda_device_context();
 const auto &index_type = framework::TransToProtoVarType(index->dtype());
 if (axis != 0) {
 if (index_type == framework::proto::VarType::INT32) {
-GatherV2GradCUDAFunction<T, int32_t>(dO, index, axis, dX,
-ctx.GetPlace(), ctx);
+phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(dO, index, axis, dX,
+dev_ctx);
 } else if (index_type == framework::proto::VarType::INT64) {
-GatherV2GradCUDAFunction<T, int64_t>(dO, index, axis, dX,
-ctx.GetPlace(), ctx);
+phi::funcs::GatherV2GradCUDAFunction<T, int64_t>(dO, index, axis, dX,
+dev_ctx);
 }
 return;
 }
@@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 dxt.device(place) = dxt.constant(static_cast<T>(0));
 if (dO->numel() == 0) return;
 if (index_type == framework::proto::VarType::INT32) {
-GPUScatterAssign<T, int>(ctx, *dO, *index, dX,
+phi::funcs::GPUScatterAssign<T, int>(dev_ctx, *dO, *index, dX,
 ctx.Attr<bool>("overwrite"));
 } else if (index_type == framework::proto::VarType::INT64) {
-GPUScatterAssign<T, int64_t>(ctx, *dO, *index, dX,
+phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX,
 ctx.Attr<bool>("overwrite"));
 }
 }
 };
...
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 namespace paddle {
 namespace operators {
@@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel<T> {
 // get axis from tensor
 if (ctx.HasInput("Axis")) {
 const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-const auto &axis_type =
-framework::TransToProtoVarType(axis_tensor->dtype());
-if (axis_type == framework::proto::VarType::INT32) {
+const auto &axis_type = axis_tensor->dtype();
+if (axis_type == phi::DataType::INT32) {
 axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-} else if (axis_type == framework::proto::VarType::INT64) {
+} else if (axis_type == phi::DataType::INT64) {
 axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
 }
 }
-const auto &place = ctx.GetPlace();
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
+const auto &index_type = index->dtype();
+auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
 if (axis != 0) {
-if (index_type == framework::proto::VarType::INT32) {
-GatherV2Function<T, int32_t>(x, index, axis, output, place);
-} else if (index_type == framework::proto::VarType::INT64) {
-GatherV2Function<T, int64_t>(x, index, axis, output, place);
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GatherV2Function<T, int32_t>(dev_ctx, x, index, axis,
+output);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::GatherV2Function<T, int64_t>(dev_ctx, x, index, axis,
+output);
 }
 return;
 }
 output->mutable_data<T>(ctx.GetPlace());
 if (x->numel() == 0) return;
-if (index_type == framework::proto::VarType::INT32) {
-CPUGather<T, int>(ctx.device_context(), *x, *index, output);
-} else if (index_type == framework::proto::VarType::INT64) {
-CPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
+if (index_type == phi::DataType::INT32) {
+phi::funcs::CPUGather<T, int>(dev_ctx, *x, *index, output);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::CPUGather<T, int64_t>(dev_ctx, *x, *index, output);
 }
 }
 };
@@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
 int axis = ctx.Attr<int>("axis");
 if (ctx.HasInput("Axis")) {
 const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-const auto &axis_type =
-framework::TransToProtoVarType(axis_tensor->dtype());
-if (axis_type == framework::proto::VarType::INT32) {
+const auto &axis_type = axis_tensor->dtype();
+if (axis_type == phi::DataType::INT32) {
 axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-} else if (axis_type == framework::proto::VarType::INT64) {
+} else if (axis_type == phi::DataType::INT64) {
 axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
 }
 }
-const auto &index_type = framework::TransToProtoVarType(index->dtype());
+const auto &index_type = index->dtype();
+auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
 if (axis != 0) {
-if (index_type == framework::proto::VarType::INT32) {
-GatherV2GradFunction<T, int32_t>(dO, index, axis, dX, ctx.GetPlace());
-} else if (index_type == framework::proto::VarType::INT64) {
-GatherV2GradFunction<T, int64_t>(dO, index, axis, dX, ctx.GetPlace());
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GatherV2GradFunction<T, int32_t>(dev_ctx, dO, index, axis,
+dX);
+} else if (index_type == phi::DataType::INT64) {
+phi::funcs::GatherV2GradFunction<T, int64_t>(dev_ctx, dO, index, axis,
+dX);
 }
 return;
 }
 dX->mutable_data<T>(ctx.GetPlace());
 auto dxt = framework::EigenVector<T>::Flatten(*dX);
-auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-.eigen_device();
+auto &place = *dev_ctx.eigen_device();
 dxt.device(place) = dxt.constant(static_cast<T>(0));
 if (dO->numel() == 0) return;
 bool overwrite = ctx.Attr<bool>("overwrite");
-if (index_type == framework::proto::VarType::INT32) {
+if (index_type == phi::DataType::INT32) {
 if (overwrite) {
-ScatterAssign<T, int32_t>(ctx.device_context(), *dO, *index, dX);
+phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *dO, *index, dX);
 } else {
-ScatterAssignAdd<T, int32_t>(ctx, *dO, *index, dX);
+phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
 }
-} else if (index_type == framework::proto::VarType::INT64) {
+} else if (index_type == phi::DataType::INT64) {
 if (overwrite) {
-ScatterAssign<T, int64_t>(ctx.device_context(), *dO, *index, dX);
+phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX);
 } else {
-ScatterAssignAdd<T, int64_t>(ctx, *dO, *index, dX);
+phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
 }
 }
 }
...
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 TEST(Gather, GatherData) {
 paddle::framework::Tensor* src = new paddle::framework::Tensor();
@@ -39,7 +39,7 @@ TEST(Gather, GatherData) {
 auto* cpu_place = new paddle::platform::CPUPlace();
 paddle::platform::CPUDeviceContext ctx(*cpu_place);
-paddle::operators::CPUGather<int>(ctx, *src, *index, output);
+phi::funcs::CPUGather<int>(ctx, *src, *index, output);
 delete cpu_place;
 cpu_place = NULL;
 for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
...
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <utility>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
...
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/segment_pooling.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace paddle {
@@ -379,9 +379,9 @@ class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
 SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
 context.stream()>>>(mean_grad.data<T>(),
 summed_ids->data<T>(), len, dim);
-GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
+phi::funcs::GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
 } else if (pooltype == "SUM") {
-GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
+phi::funcs::GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
 } else {
 PADDLE_THROW(platform::errors::InvalidArgument(
 "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
...
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
 #include "paddle/fluid/operators/scatter_nd_add_op.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 namespace paddle {
 namespace operators {
@@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel<T> {
 auto *Out = ctx.Output<Tensor>("Out");
 framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-index_type == framework::proto::VarType::INT64;
-PADDLE_ENFORCE_EQ(index_type_match, true,
-platform::errors::InvalidArgument(
-"Index holds the wrong type, it holds [%s], but "
-"desires to be [%s] or [%s].",
-paddle::framework::DataTypeToString(index_type),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT32),
-paddle::framework::DataTypeToString(
-framework::proto::VarType::INT64)));
-if (index_type == framework::proto::VarType::INT32) {
-GPUScatterNdAdd<DeviceContext, T, int32_t>(ctx, *Updates, *Ids, Out);
+const auto &index_type = Ids->dtype();
+bool index_type_match = index_type == phi::DataType::INT32 ||
+index_type == phi::DataType::INT64;
+PADDLE_ENFORCE_EQ(
+index_type_match, true,
+platform::errors::InvalidArgument(
+"Index holds the wrong type, it holds [%s], but "
+"desires to be [%s] or [%s].",
+index_type, phi::DataType::INT32, phi::DataType::INT64));
+auto &dev_ctx = ctx.cuda_device_context();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GPUScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
 } else {
-GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *Updates, *Ids, Out);
+phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
 }
 }
 };
@@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel<T> {
 }
 if (dUpdates) {
 dUpdates->mutable_data<T>(ctx.GetPlace());
+auto &dev_ctx = ctx.cuda_device_context();
 // Gradient by Gather
-const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-if (index_type == framework::proto::VarType::INT32) {
-GPUGatherNd<DeviceContext, T, int32_t>(ctx, *dOut, *Ids, dUpdates);
+const auto &index_type = Ids->dtype();
+if (index_type == phi::DataType::INT32) {
+phi::funcs::GPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
 } else {
-GPUGatherNd<DeviceContext, T, int64_t>(ctx, *dOut, *Ids, dUpdates);
+phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
 }
 }
 }
...
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/gather.h"
#include "paddle/fluid/operators/scatter.h" #include "paddle/phi/kernels/funcs/scatter.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel<T> { ...@@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel<T> {
// In place output: Out = X // In place output: Out = X
framework::TensorCopySync(*X, ctx.GetPlace(), Out); framework::TensorCopySync(*X, ctx.GetPlace(), Out);
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); const auto &index_type = Ids->dtype();
bool index_type_match = index_type == framework::proto::VarType::INT32 || bool index_type_match = index_type == phi::DataType::INT32 ||
index_type == framework::proto::VarType::INT64; index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true, PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( index_type_match, true,
"Index holds the wrong type, it holds [%s], but " platform::errors::InvalidArgument(
"desires to be [%s] or [%s].", "Index holds the wrong type, it holds [%s], but "
paddle::framework::DataTypeToString(index_type), "desires to be [%s] or [%s].",
paddle::framework::DataTypeToString( index_type, phi::DataType::INT32, phi::DataType::INT64));
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (index_type == framework::proto::VarType::INT32) { auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
ScatterNdAdd<T, int32_t>(ctx, *Updates, *Ids, Out); if (index_type == phi::DataType::INT32) {
phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
} else { } else {
ScatterNdAdd<T, int64_t>(ctx, *Updates, *Ids, Out); phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
} }
} }
}; };
...@@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel<T> { ...@@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel<T> {
if (dUpdates) { if (dUpdates) {
dUpdates->mutable_data<T>(ctx.GetPlace()); dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Ids] // Gradient by Gather: dUpdates = dO[Ids]
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); const auto &index_type = Ids->dtype();
if (index_type == framework::proto::VarType::INT32) { auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
CPUGatherNd<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates); if (index_type == phi::DataType::INT32) {
phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
} else { } else {
CPUGatherNd<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates); phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
} }
} }
} }
......
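The CPU kernels follow suit, obtaining the context with ctx.template device_context<phi::CPUContext>(). Every kernel in these files spells out the int32/int64 branch by hand; the hypothetical wrapper below (not part of this commit — the name DispatchIndexType and the include path are my own sketch) only illustrates the pattern being repeated:

#include <utility>
#include "paddle/phi/common/data_type.h"  // phi::DataType (assumed location)

// Hypothetical helper, for illustration only: centralizes the int32/int64
// index dispatch that each scatter/gather kernel writes out by hand.
template <typename Functor>
void DispatchIndexType(phi::DataType index_type, Functor &&f) {
  if (index_type == phi::DataType::INT32) {
    std::forward<Functor>(f)(int32_t{});
  } else {
    std::forward<Functor>(f)(int64_t{});
  }
}

// Possible use inside ScatterNdAddOpKernel::Compute (dev_ctx, Updates, Ids,
// Out as in the hunk above):
//   DispatchIndexType(Ids->dtype(), [&](auto tag) {
//     using IndexT = decltype(tag);
//     phi::funcs::ScatterNdAdd<T, IndexT>(dev_ctx, *Updates, *Ids, Out);
//   });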
...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/operators/gather_op.h"
#include "paddle/fluid/operators/scatter.cu.h"
#include "paddle/fluid/operators/scatter_op.h" #include "paddle/fluid/operators/scatter_op.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> { ...@@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
framework::TensorCopy(*X, ctx.GetPlace(), Out); framework::TensorCopy(*X, ctx.GetPlace(), Out);
// use template class to support int32_t and int64_t // use template class to support int32_t and int64_t
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); auto index_type = Ids->dtype();
bool index_type_match = index_type == framework::proto::VarType::INT32 || bool index_type_match = index_type == phi::DataType::INT32 ||
index_type == framework::proto::VarType::INT64; index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
index_type_match, true, index_type_match, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"scatter_op Index holds the wrong type, it holds [%s]," "scatter_op Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s].", "but desires to be [%s] or [%s].",
paddle::framework::DataTypeToString(index_type), index_type, phi::DataType::INT32, phi::DataType::INT64));
paddle::framework::DataTypeToString( auto &dev_ctx = ctx.cuda_device_context();
framework::proto::VarType::INT32), if (index_type == phi::DataType::INT32) {
paddle::framework::DataTypeToString( phi::funcs::GPUScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out,
framework::proto::VarType::INT64))); overwrite);
if (index_type == framework::proto::VarType::INT32) {
GPUScatterAssign<T, int32_t>(ctx, *Updates, *Ids, Out, overwrite);
} else { } else {
GPUScatterAssign<T, int64_t>(ctx, *Updates, *Ids, Out, overwrite); phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out,
overwrite);
} }
} }
}; };
...@@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
auto *Ids = ctx.Input<Tensor>("Ids"); auto *Ids = ctx.Input<Tensor>("Ids");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out")); auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); auto index_type = Ids->dtype();
bool index_type_match = index_type == framework::proto::VarType::INT32 || bool index_type_match = index_type == phi::DataType::INT32 ||
index_type == framework::proto::VarType::INT64; index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
index_type_match, true, index_type_match, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"scatter_op index holds the wrong type, it holds [%s]," "scatter_op index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]", "but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type), index_type, phi::DataType::INT32, phi::DataType::INT64));
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto &dev_ctx = ctx.cuda_device_context();
if (dX) { if (dX) {
framework::TensorCopy(*dOut, ctx.GetPlace(), dX); framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
GPUScatterGradForX<T, int32_t>(ctx.device_context(), *Ids, dX); phi::funcs::GPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
} else { } else {
GPUScatterGradForX<T, int64_t>(ctx.device_context(), *Ids, dX); phi::funcs::GPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
} }
} }
if (dUpdates) { if (dUpdates) {
dUpdates->mutable_data<T>(ctx.GetPlace()); dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Ids] // Gradient by Gather: dUpdates = dO[Ids]
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
GPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates); phi::funcs::GPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
} else { } else {
GPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates); phi::funcs::GPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
} }
} }
} }
......
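scatter_op.cu gets the same treatment: the forward keeps overwrite as the trailing argument of phi::funcs::GPUScatterAssign, and the grad kernel now fetches the device context once for both branches. A condensed view of the gradient path from the hunk above, showing only the int64 index branch:

auto &dev_ctx = ctx.cuda_device_context();
if (dX) {
  // dX starts as a copy of dOut; GPUScatterGradForX then zeroes the rows
  // addressed by Ids so their gradient is carried by dUpdates instead.
  framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
  phi::funcs::GPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
}
if (dUpdates) {
  dUpdates->mutable_data<T>(ctx.GetPlace());
  phi::funcs::GPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);  // dUpdates = dOut[Ids]
}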
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/gather.h"
#include "paddle/fluid/operators/scatter.h" #include "paddle/phi/kernels/funcs/scatter.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel<T> { ...@@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel<T> {
// In place output: Out = X, Out[Ids] = Updates // In place output: Out = X, Out[Ids] = Updates
framework::TensorCopy(*X, ctx.GetPlace(), Out); framework::TensorCopy(*X, ctx.GetPlace(), Out);
// Apply ScatterUpdate: Out[index] = Updates[:] // Apply ScatterUpdate: Out[index] = Updates[:]
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); const auto &index_type = Ids->dtype();
bool index_type_match = index_type == framework::proto::VarType::INT32 || bool index_type_match = index_type == phi::DataType::INT32 ||
index_type == framework::proto::VarType::INT64; index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ(index_type_match, true, PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( index_type_match, true,
"Index holds the wrong type, it holds [%s]," platform::errors::InvalidArgument(
"but desires to be [%s] or [%s].", "Index holds the wrong type, it holds [%s],"
paddle::framework::DataTypeToString(index_type), "but desires to be [%s] or [%s].",
paddle::framework::DataTypeToString( index_type, phi::DataType::INT32, phi::DataType::INT64));
framework::proto::VarType::INT32), auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
if (overwrite) { if (overwrite) {
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
ScatterAssign<T, int32_t>(ctx.device_context(), *Updates, *Ids, Out); phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
} else { } else {
ScatterAssign<T, int64_t>(ctx.device_context(), *Updates, *Ids, Out); phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
} }
} else { } else {
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
ScatterAssignAdd<T, int32_t>(ctx, *Updates, *Ids, Out); phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
} else { } else {
ScatterAssignAdd<T, int64_t>(ctx, *Updates, *Ids, Out); phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
} }
} }
} }
...@@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> { ...@@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto *Ids = ctx.Input<Tensor>("Ids"); auto *Ids = ctx.Input<Tensor>("Ids");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out")); auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); const auto &index_type = Ids->dtype();
bool index_type_match = index_type == framework::proto::VarType::INT32 || bool index_type_match = index_type == phi::DataType::INT32 ||
index_type == framework::proto::VarType::INT64; index_type == phi::DataType::INT64;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
index_type_match, true, index_type_match, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"scatter_op index holds the wrong type, it holds [%s]," "scatter_op index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]", "but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type), index_type, phi::DataType::INT32, phi::DataType::INT64));
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
if (dX) { if (dX) {
framework::TensorCopy(*dOut, ctx.GetPlace(), dX); framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
CPUScatterGradForX<T, int32_t>(ctx.device_context(), *Ids, dX); phi::funcs::CPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
} else { } else {
CPUScatterGradForX<T, int64_t>(ctx.device_context(), *Ids, dX); phi::funcs::CPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
} }
} }
if (dUpdates) { if (dUpdates) {
dUpdates->mutable_data<T>(ctx.GetPlace()); dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Ids] // Gradient by Gather: dUpdates = dO[Ids]
if (index_type == framework::proto::VarType::INT32) { if (index_type == phi::DataType::INT32) {
CPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates); phi::funcs::CPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
} else { } else {
CPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates); phi::funcs::CPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
} }
} }
} }
......
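For reference, the two CPU branches above differ only in how repeated indices combine: ScatterAssign overwrites, ScatterAssignAdd accumulates. A standalone sketch of those semantics on plain arrays (illustration only, not the Paddle implementation, which operates on row slices of tensors):

#include <cstdio>
#include <vector>

// Reference semantics of scatter with overwrite=true vs. overwrite=false,
// shown for scalar rows; the real kernels copy/accumulate whole row slices.
int main() {
  std::vector<float> out_assign = {0, 0, 0, 0};
  std::vector<float> out_add    = {0, 0, 0, 0};
  std::vector<int>   ids        = {1, 3, 1};
  std::vector<float> updates    = {10, 20, 30};

  for (size_t i = 0; i < ids.size(); ++i) {
    out_assign[ids[i]] = updates[i];  // ScatterAssign: last write wins  -> out[1] == 30
    out_add[ids[i]]   += updates[i];  // ScatterAssignAdd: accumulate    -> out[1] == 40
  }
  std::printf("assign: %g %g %g %g\n", out_assign[0], out_assign[1],
              out_assign[2], out_assign[3]);
  std::printf("add:    %g %g %g %g\n", out_add[0], out_add[1],
              out_add[2], out_add[3]);
  return 0;
}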
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/scatter.h" #include "paddle/phi/kernels/funcs/scatter.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { ...@@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) {
auto* cpu_place = new paddle::platform::CPUPlace(); auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place); paddle::platform::CPUDeviceContext ctx(*cpu_place);
paddle::operators::ScatterAssign<float>(ctx, src, index, &output); phi::funcs::ScatterAssign<float>(ctx, src, index, &output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
......
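In the unit test only the callee's namespace changes; the platform::CPUDeviceContext built by the test is handed straight to the phi::funcs entry point, which presumably compiles because the fluid CPU context is interchangeable with phi::CPUContext at this stage of the migration. The updated call, with the tensor setup elided as in the hunk:

// scatter_test.cc after the move: src, index, output are prepared earlier in
// the test (not shown in the hunk above).
auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
phi::funcs::ScatterAssign<float>(ctx, src, index, &output);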
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/operators/segment_pool_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
......
...@@ -16,8 +16,6 @@ limitations under the License. */ ...@@ -16,8 +16,6 @@ limitations under the License. */
#include <memory> #include <memory>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/scatter.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -15,8 +15,7 @@ limitations under the License. */ ...@@ -15,8 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/scatter.h"
#include "paddle/fluid/operators/scatter.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -11,8 +11,8 @@ limitations under the License. */ ...@@ -11,8 +11,8 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/viterbi_decode_op.h" #include "paddle/fluid/operators/viterbi_decode_op.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"
#ifdef __NVCC__ #ifdef __NVCC__
#include "cub/cub.cuh" #include "cub/cub.cuh"
...@@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) { ...@@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) {
template <template <typename T> typename BinaryFunctor, typename T> template <template <typename T> typename BinaryFunctor, typename T>
struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> { struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx, const Tensor& lhs, void operator()(const platform::CUDADeviceContext& dev_ctx,
const Tensor& rhs, Tensor* output) { const framework::Tensor& lhs, const framework::Tensor& rhs,
std::vector<const Tensor*> ins{&lhs, &rhs}; framework::Tensor* output) {
std::vector<Tensor*> outs{output}; std::vector<const framework::Tensor*> ins{&lhs, &rhs};
std::vector<framework::Tensor*> outs{output};
paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
T>(dev_ctx, ins, &outs, -1, T>(dev_ctx, ins, &outs, -1,
BinaryFunctor<T>()); BinaryFunctor<T>());
...@@ -75,10 +76,11 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> { ...@@ -75,10 +76,11 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
template <template <typename InT, typename OutT> typename CompareFunctor, template <template <typename InT, typename OutT> typename CompareFunctor,
typename T> typename T>
struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> { struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs, void operator()(const framework::ExecutionContext& ctx,
const Tensor& rhs, Tensor* mask) { const framework::Tensor& lhs, const framework::Tensor& rhs,
std::vector<const Tensor*> ins = {&lhs, &rhs}; framework::Tensor* mask) {
std::vector<Tensor*> outs = {mask}; std::vector<const framework::Tensor*> ins = {&lhs, &rhs};
std::vector<framework::Tensor*> outs = {mask};
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>( paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
dev_ctx, ins, &outs, CompareFunctor<int64_t, T>()); dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
...@@ -131,8 +133,9 @@ struct ARange<platform::CUDADeviceContext> { ...@@ -131,8 +133,9 @@ struct ARange<platform::CUDADeviceContext> {
template <typename T, typename IndType> template <typename T, typename IndType>
struct Argmax<platform::CUDADeviceContext, T, IndType> { struct Argmax<platform::CUDADeviceContext, T, IndType> {
void operator()(const framework::ExecutionContext& ctx, const Tensor& input, void operator()(const framework::ExecutionContext& ctx,
Tensor* out_idx, Tensor* out, int axis) { const framework::Tensor& input, framework::Tensor* out_idx,
framework::Tensor* out, int axis) {
framework::DDim input_dims = input.dims(); framework::DDim input_dims = input.dims();
int64_t numel = input.numel(); int64_t numel = input.numel();
int64_t groups = numel / input_dims[axis]; int64_t groups = numel / input_dims[axis];
...@@ -166,8 +169,8 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> { ...@@ -166,8 +169,8 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
template <typename T> template <typename T>
struct GetMaxValue<platform::CUDADeviceContext, T> { struct GetMaxValue<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const platform::CUDADeviceContext& dev_ctx,
const Tensor& input, T* max_value) { const framework::Tensor& input, T* max_value) {
Tensor out_data; framework::Tensor out_data;
out_data.Resize(phi::make_ddim({1})); out_data.Resize(phi::make_ddim({1}));
out_data.mutable_data<T>(platform::CUDAPlace()); out_data.mutable_data<T>(platform::CUDAPlace());
switch (ComputeBlockSize(input.numel())) { switch (ComputeBlockSize(input.numel())) {
...@@ -177,7 +180,7 @@ struct GetMaxValue<platform::CUDADeviceContext, T> { ...@@ -177,7 +180,7 @@ struct GetMaxValue<platform::CUDADeviceContext, T> {
1, input.numel(), 1, input.data<int64_t>(), nullptr, 1, input.numel(), 1, input.data<int64_t>(), nullptr,
out_data.data<int64_t>())); out_data.data<int64_t>()));
} }
Tensor max_value_tensor; framework::Tensor max_value_tensor;
framework::TensorCopy(out_data, platform::CPUPlace(), &max_value_tensor); framework::TensorCopy(out_data, platform::CPUPlace(), &max_value_tensor);
*max_value = max_value_tensor.data<T>()[0]; *max_value = max_value_tensor.data<T>()[0];
} }
...@@ -185,9 +188,10 @@ struct GetMaxValue<platform::CUDADeviceContext, T> { ...@@ -185,9 +188,10 @@ struct GetMaxValue<platform::CUDADeviceContext, T> {
template <typename T, typename IndexT> template <typename T, typename IndexT>
struct Gather<platform::CUDADeviceContext, T, IndexT> { struct Gather<platform::CUDADeviceContext, T, IndexT> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor& src, void operator()(const platform::CUDADeviceContext& ctx,
const Tensor& index, Tensor* output) { const framework::Tensor& src, const framework::Tensor& index,
GPUGather<T, IndexT>(ctx, src, index, output); framework::Tensor* output) {
phi::funcs::GPUGather<T, IndexT>(ctx, src, index, output);
} }
}; };
......
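Most of the churn in viterbi_decode_op.cu is mechanical: the companion header drops its LoDTensor alias (and, judging by the signature changes, the unqualified Tensor alias as well), so the functor signatures now spell out framework::Tensor, while Gather itself is rerouted to phi::funcs. Reassembled from the two files for readability, the device-dispatched functor pair reads:

// Primary template (used with the CPU context) and CUDA specialization, both
// forwarding to the relocated phi::funcs gather helpers.
template <typename DeviceContext, typename T, typename IndexT = int>
struct Gather {
  void operator()(const DeviceContext& ctx, const framework::Tensor& src,
                  const framework::Tensor& index, framework::Tensor* output) {
    phi::funcs::CPUGather<T, IndexT>(ctx, src, index, output);
  }
};

template <typename T, typename IndexT>
struct Gather<platform::CUDADeviceContext, T, IndexT> {
  void operator()(const platform::CUDADeviceContext& ctx,
                  const framework::Tensor& src, const framework::Tensor& index,
                  framework::Tensor* output) {
    phi::funcs::GPUGather<T, IndexT>(ctx, src, index, output);
  }
};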
...@@ -17,10 +17,10 @@ limitations under the License. */ ...@@ -17,10 +17,10 @@ limitations under the License. */
#include "paddle/fluid/operators/controlflow/compare_op.h" #include "paddle/fluid/operators/controlflow/compare_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/unique_op.h"
#include "paddle/phi/kernels/funcs/gather.h"
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#include <omp.h> #include <omp.h>
#endif #endif
...@@ -28,12 +28,11 @@ limitations under the License. */ ...@@ -28,12 +28,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext, typename T, typename IndType> template <typename DeviceContext, typename T, typename IndType>
struct Argmax { struct Argmax {
void operator()(const framework::ExecutionContext& ctx, const Tensor& input, void operator()(const framework::ExecutionContext& ctx,
Tensor* out_idx, Tensor* out, int axis) { const framework::Tensor& input, framework::Tensor* out_idx,
framework::Tensor* out, int axis) {
framework::DDim input_dims = input.dims(); framework::DDim input_dims = input.dims();
int64_t pre = 1; int64_t pre = 1;
int64_t post = 1; int64_t post = 1;
...@@ -82,7 +81,7 @@ struct ARange { ...@@ -82,7 +81,7 @@ struct ARange {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct GetMaxValue { struct GetMaxValue {
void operator()(const DeviceContext& dev_ctx, const Tensor& input, void operator()(const DeviceContext& dev_ctx, const framework::Tensor& input,
T* max_value) { T* max_value) {
auto input_ptr = input.data<T>(); auto input_ptr = input.data<T>();
auto num = input.numel(); auto num = input.numel();
...@@ -92,14 +91,15 @@ struct GetMaxValue { ...@@ -92,14 +91,15 @@ struct GetMaxValue {
template <typename DeviceContext, typename T, typename IndexT = int> template <typename DeviceContext, typename T, typename IndexT = int>
struct Gather { struct Gather {
void operator()(const DeviceContext& ctx, const Tensor& src, void operator()(const DeviceContext& ctx, const framework::Tensor& src,
const Tensor& index, Tensor* output) { const framework::Tensor& index, framework::Tensor* output) {
CPUGather<T, IndexT>(ctx, src, index, output); phi::funcs::CPUGather<T, IndexT>(ctx, src, index, output);
} }
}; };
template <typename T, typename Functor, typename OutT = T> template <typename T, typename Functor, typename OutT = T>
void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) { void SameDimsBinaryOP(const framework::Tensor& lhs,
const framework::Tensor& rhs, framework::Tensor* out) {
const T* lhs_ptr = lhs.data<T>(); const T* lhs_ptr = lhs.data<T>();
const T* rhs_ptr = rhs.data<T>(); const T* rhs_ptr = rhs.data<T>();
OutT* out_ptr = out->data<OutT>(); OutT* out_ptr = out->data<OutT>();
...@@ -116,8 +116,9 @@ template <typename DeviceContext, ...@@ -116,8 +116,9 @@ template <typename DeviceContext,
template <typename InT, typename OutT> typename CompareFunctor, template <typename InT, typename OutT> typename CompareFunctor,
typename T> typename T>
struct GetMask { struct GetMask {
void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs, void operator()(const framework::ExecutionContext& ctx,
const Tensor& rhs, Tensor* mask) { const framework::Tensor& lhs, const framework::Tensor& rhs,
framework::Tensor* mask) {
SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask); SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
} }
}; };
...@@ -161,8 +162,9 @@ struct GetInputIndex<false> { ...@@ -161,8 +162,9 @@ struct GetInputIndex<false> {
}; };
template <typename T, typename Functor, bool is_multi_threads = false> template <typename T, typename Functor, bool is_multi_threads = false>
void SimpleBroadcastBinaryOP(const Tensor& lhs, const Tensor& rhs, void SimpleBroadcastBinaryOP(const framework::Tensor& lhs,
Tensor* out) { const framework::Tensor& rhs,
framework::Tensor* out) {
const T* lhs_ptr = lhs.data<T>(); const T* lhs_ptr = lhs.data<T>();
const T* rhs_ptr = rhs.data<T>(); const T* rhs_ptr = rhs.data<T>();
T* out_ptr = out->data<T>(); T* out_ptr = out->data<T>();
...@@ -200,8 +202,8 @@ void SimpleBroadcastBinaryOP(const Tensor& lhs, const Tensor& rhs, ...@@ -200,8 +202,8 @@ void SimpleBroadcastBinaryOP(const Tensor& lhs, const Tensor& rhs,
template <typename DeviceContext, template <typename T> typename BinaryFunctor, template <typename DeviceContext, template <typename T> typename BinaryFunctor,
typename T> typename T>
struct BinaryOperation { struct BinaryOperation {
void operator()(const DeviceContext& dev_ctx, const Tensor& lhs, void operator()(const DeviceContext& dev_ctx, const framework::Tensor& lhs,
const Tensor& rhs, Tensor* output) { const framework::Tensor& rhs, framework::Tensor* output) {
if (lhs.dims() == rhs.dims()) { if (lhs.dims() == rhs.dims()) {
SameDimsBinaryOP<T, BinaryFunctor<T>>(lhs, rhs, output); SameDimsBinaryOP<T, BinaryFunctor<T>>(lhs, rhs, output);
} else { } else {
...@@ -222,20 +224,21 @@ struct BinaryOperation { ...@@ -222,20 +224,21 @@ struct BinaryOperation {
class TensorBuffer { class TensorBuffer {
public: public:
explicit TensorBuffer(const LoDTensor& in) : buffer_(in), offset_(0) { explicit TensorBuffer(const framework::LoDTensor& in)
: buffer_(in), offset_(0) {
buffer_.Resize({buffer_.numel()}); buffer_.Resize({buffer_.numel()});
} }
Tensor GetBufferBlock(std::initializer_list<int64_t> shape) { framework::Tensor GetBufferBlock(std::initializer_list<int64_t> shape) {
int64_t size = std::accumulate(shape.begin(), shape.end(), 1, int64_t size = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>()); std::multiplies<int64_t>());
Tensor block = buffer_.Slice(offset_, offset_ + size); framework::Tensor block = buffer_.Slice(offset_, offset_ + size);
offset_ += size; offset_ += size;
block.Resize(shape); block.Resize(shape);
return block; return block;
} }
private: private:
LoDTensor buffer_; // need to resize 1-D Tensor framework::LoDTensor buffer_; // need to resize 1-D Tensor
int offset_; int offset_;
}; };
...@@ -246,17 +249,17 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> { ...@@ -246,17 +249,17 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
bool include_bos_eos_tag = ctx.Attr<bool>("include_bos_eos_tag"); bool include_bos_eos_tag = ctx.Attr<bool>("include_bos_eos_tag");
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto curr_place = ctx.GetPlace(); auto curr_place = ctx.GetPlace();
auto* input = ctx.Input<Tensor>("Input"); auto* input = ctx.Input<framework::Tensor>("Input");
auto batch_size = static_cast<int>(input->dims()[0]); auto batch_size = static_cast<int>(input->dims()[0]);
auto seq_len = static_cast<int>(input->dims()[1]); auto seq_len = static_cast<int>(input->dims()[1]);
auto n_labels = static_cast<int>(input->dims()[2]); auto n_labels = static_cast<int>(input->dims()[2]);
phi::funcs::SetConstant<DeviceContext, T> float_functor; phi::funcs::SetConstant<DeviceContext, T> float_functor;
phi::funcs::SetConstant<DeviceContext, int64_t> int_functor; phi::funcs::SetConstant<DeviceContext, int64_t> int_functor;
std::vector<Tensor> historys; std::vector<framework::Tensor> historys;
// We create tensor buffer in order to avoid allocating memory frequently // We create tensor buffer in order to avoid allocating memory frequently
// 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size; int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
LoDTensor int_buffer; framework::LoDTensor int_buffer;
int_buffer.Resize(phi::make_ddim({buffer_size})); int_buffer.Resize(phi::make_ddim({buffer_size}));
int_buffer.mutable_data<int64_t>(ctx.GetPlace()); int_buffer.mutable_data<int64_t>(ctx.GetPlace());
TensorBuffer int_tensor_buffer(int_buffer); TensorBuffer int_tensor_buffer(int_buffer);
...@@ -264,64 +267,78 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> { ...@@ -264,64 +267,78 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
// 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max // 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max
buffer_size = batch_size * (seq_len + 10) * n_labels + buffer_size = batch_size * (seq_len + 10) * n_labels +
(batch_size + 2) * n_labels * n_labels; (batch_size + 2) * n_labels * n_labels;
LoDTensor float_buffer; framework::LoDTensor float_buffer;
float_buffer.Resize(phi::make_ddim({buffer_size})); float_buffer.Resize(phi::make_ddim({buffer_size}));
float_buffer.mutable_data<T>(ctx.GetPlace()); float_buffer.mutable_data<T>(ctx.GetPlace());
TensorBuffer float_tensor_buffer(float_buffer); TensorBuffer float_tensor_buffer(float_buffer);
auto* length = ctx.Input<Tensor>("Length"); auto* length = ctx.Input<framework::Tensor>("Length");
Tensor left_length = int_tensor_buffer.GetBufferBlock({batch_size, 1}); framework::Tensor left_length =
int_tensor_buffer.GetBufferBlock({batch_size, 1});
framework::TensorCopy(*length, curr_place, dev_ctx, &left_length); framework::TensorCopy(*length, curr_place, dev_ctx, &left_length);
int64_t max_seq_len = 0; int64_t max_seq_len = 0;
GetMaxValue<DeviceContext, int64_t> get_max_value; GetMaxValue<DeviceContext, int64_t> get_max_value;
get_max_value(dev_ctx, left_length, &max_seq_len); get_max_value(dev_ctx, left_length, &max_seq_len);
auto* scores = ctx.Output<Tensor>("Scores"); auto* scores = ctx.Output<framework::Tensor>("Scores");
scores->mutable_data<T>(curr_place); scores->mutable_data<T>(curr_place);
auto* path = ctx.Output<Tensor>("Path"); auto* path = ctx.Output<framework::Tensor>("Path");
path->Resize({batch_size, max_seq_len}); path->Resize({batch_size, max_seq_len});
path->mutable_data<int64_t>(curr_place); path->mutable_data<int64_t>(curr_place);
Tensor tpath = int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size}); framework::Tensor tpath =
int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
auto batch_path = Unbind(tpath); auto batch_path = Unbind(tpath);
for (auto it = batch_path.begin(); it != batch_path.end(); ++it) { for (auto it = batch_path.begin(); it != batch_path.end(); ++it) {
it->Resize({batch_size}); it->Resize({batch_size});
} }
// create and init required tensor // create and init required tensor
Tensor input_exp = framework::Tensor input_exp =
float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels}); float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
TransCompute<DeviceContext, T>(3, dev_ctx, *input, &input_exp, {1, 0, 2}); TransCompute<DeviceContext, T>(3, dev_ctx, *input, &input_exp, {1, 0, 2});
auto* transition = ctx.Input<Tensor>("Transition"); auto* transition = ctx.Input<framework::Tensor>("Transition");
Tensor trans_exp = float_tensor_buffer.GetBufferBlock({n_labels, n_labels}); framework::Tensor trans_exp =
float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
framework::TensorCopy(*transition, curr_place, dev_ctx, &trans_exp); framework::TensorCopy(*transition, curr_place, dev_ctx, &trans_exp);
trans_exp.Resize({1, n_labels, n_labels}); trans_exp.Resize({1, n_labels, n_labels});
Tensor alpha = float_tensor_buffer.GetBufferBlock({batch_size, n_labels}); framework::Tensor alpha =
Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1}); float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
framework::Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
int_functor(dev_ctx, &zero, 0); int_functor(dev_ctx, &zero, 0);
Tensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1}); framework::Tensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
int_functor(dev_ctx, &one, 1); int_functor(dev_ctx, &one, 1);
Tensor float_one = float_tensor_buffer.GetBufferBlock({batch_size, 1}); framework::Tensor float_one =
float_tensor_buffer.GetBufferBlock({batch_size, 1});
float_functor(dev_ctx, &float_one, static_cast<T>(1.0)); float_functor(dev_ctx, &float_one, static_cast<T>(1.0));
Tensor alpha_trn_sum = framework::Tensor alpha_trn_sum =
float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels}); float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels});
Tensor alpha_max = framework::Tensor alpha_max =
float_tensor_buffer.GetBufferBlock({batch_size, n_labels}); float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
Tensor alpha_argmax = framework::Tensor alpha_argmax =
int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels}); int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
auto alpha_argmax_unbind = Unbind(alpha_argmax); auto alpha_argmax_unbind = Unbind(alpha_argmax);
Tensor alpha_nxt = framework::Tensor alpha_nxt =
float_tensor_buffer.GetBufferBlock({batch_size, n_labels}); float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
Tensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size}); framework::Tensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
Tensor zero_len_mask = int_tensor_buffer.GetBufferBlock({batch_size}); framework::Tensor zero_len_mask =
Tensor float_mask = float_tensor_buffer.GetBufferBlock({batch_size, 1}); int_tensor_buffer.GetBufferBlock({batch_size});
Tensor stop_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels}); framework::Tensor float_mask =
Tensor start_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels}); float_tensor_buffer.GetBufferBlock({batch_size, 1});
Tensor rest_trans = framework::Tensor stop_trans =
float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
framework::Tensor start_trans =
float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
framework::Tensor rest_trans =
float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels}); float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels});
Tensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size}); framework::Tensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
Tensor last_ids_tmp = int_tensor_buffer.GetBufferBlock({batch_size}); framework::Tensor last_ids_tmp =
Tensor batch_offset = int_tensor_buffer.GetBufferBlock({batch_size}); int_tensor_buffer.GetBufferBlock({batch_size});
Tensor gather_idx = int_tensor_buffer.GetBufferBlock({batch_size}); framework::Tensor batch_offset =
std::vector<const Tensor*> shape{&rest_trans, &stop_trans, &start_trans}; int_tensor_buffer.GetBufferBlock({batch_size});
std::vector<Tensor*> outputs{&rest_trans, &stop_trans, &start_trans}; framework::Tensor gather_idx =
int_tensor_buffer.GetBufferBlock({batch_size});
std::vector<const framework::Tensor*> shape{&rest_trans, &stop_trans,
&start_trans};
std::vector<framework::Tensor*> outputs{&rest_trans, &stop_trans,
&start_trans};
math::SplitFunctor<DeviceContext, T> split_functor; math::SplitFunctor<DeviceContext, T> split_functor;
split_functor(dev_ctx, trans_exp, shape, 1, &outputs); split_functor(dev_ctx, trans_exp, shape, 1, &outputs);
stop_trans.Resize({1, n_labels}); stop_trans.Resize({1, n_labels});
...@@ -346,9 +363,9 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> { ...@@ -346,9 +363,9 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
SubInt(dev_ctx, left_length, one, &left_length); SubInt(dev_ctx, left_length, one, &left_length);
Argmax<DeviceContext, T, int64_t> argmax; Argmax<DeviceContext, T, int64_t> argmax;
for (int64_t i = 1; i < max_seq_len; ++i) { for (int64_t i = 1; i < max_seq_len; ++i) {
Tensor logit = input_exp.Slice(i, i + 1); framework::Tensor logit = input_exp.Slice(i, i + 1);
logit.Resize({batch_size, n_labels}); logit.Resize({batch_size, n_labels});
Tensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1}); framework::Tensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum); AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum);
auto alpha_argmax_temp = alpha_argmax_unbind[i - 1]; auto alpha_argmax_temp = alpha_argmax_unbind[i - 1];
alpha_argmax_temp.Resize({batch_size, n_labels}); alpha_argmax_temp.Resize({batch_size, n_labels});
...@@ -395,7 +412,8 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> { ...@@ -395,7 +412,8 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
++last_ids_index; ++last_ids_index;
AddInt(dev_ctx, left_length, one, &left_length); AddInt(dev_ctx, left_length, one, &left_length);
AddInt(dev_ctx, batch_offset, last_ids, &gather_idx); AddInt(dev_ctx, batch_offset, last_ids, &gather_idx);
Tensor& last_ids_update = batch_path[actual_len - last_ids_index]; framework::Tensor& last_ids_update =
batch_path[actual_len - last_ids_index];
hist->Resize({batch_size * n_labels}); hist->Resize({batch_size * n_labels});
gather(dev_ctx, *hist, gather_idx, &last_ids_update); gather(dev_ctx, *hist, gather_idx, &last_ids_update);
GetMask<DeviceContext, GreaterThanFunctor, int64_t>()(ctx, left_length, GetMask<DeviceContext, GreaterThanFunctor, int64_t>()(ctx, left_length,
......
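Inside ViterbiDecodeKernel nothing functional changes: the qualified names aside, the decode loop still carves all of its scratch tensors out of two large buffers, and the backtrace step reaches phi::funcs::{CPU,GPU}Gather through the Gather functor shown earlier. A condensed view of the buffer-carving pattern, taken from the kernel body above:

// One large allocation, then fixed-shape blocks handed out sequentially, to
// avoid many small allocations inside the decode loop.
framework::LoDTensor int_buffer;
int_buffer.Resize(phi::make_ddim({buffer_size}));
int_buffer.mutable_data<int64_t>(ctx.GetPlace());
TensorBuffer int_tensor_buffer(int_buffer);

framework::Tensor left_length =
    int_tensor_buffer.GetBufferBlock({batch_size, 1});  // shares int_buffer's storage
framework::Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});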
...@@ -13,24 +13,25 @@ See the License for the specific language governing permissions and ...@@ -13,24 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/framework/tensor.h" // TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/utils/dim.h" #include "paddle/phi/core/utils/dim.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using framework::Tensor; namespace phi {
using platform::DeviceContext; namespace funcs {
template <typename T, typename IndexT = int> template <typename T, typename IndexT = int>
__global__ void GatherCUDAKernel(const T* params, const IndexT* indices, __global__ void GatherCUDAKernel(const T* params,
T* output, size_t index_size, const IndexT* indices,
T* output,
size_t index_size,
size_t slice_size) { size_t slice_size) {
CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
int64_t indices_i = i / slice_size; int64_t indices_i = i / slice_size;
...@@ -42,9 +43,12 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, ...@@ -42,9 +43,12 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
} }
template <typename T, typename IndexT = int> template <typename T, typename IndexT = int>
__global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, __global__ void GatherNdCUDAKernel(const T* input,
const IndexT* indices, T* output, const int64_t* input_dims,
size_t remain_size, size_t slice_size, const IndexT* indices,
T* output,
size_t remain_size,
size_t slice_size,
size_t end_size) { size_t end_size) {
CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) { CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {
int64_t indices_i = i / slice_size; int64_t indices_i = i / slice_size;
...@@ -59,7 +63,8 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, ...@@ -59,7 +63,8 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims,
"please check whether the dimensions of index and " "please check whether the dimensions of index and "
"input meet the requirements. It should " "input meet the requirements. It should "
"be less than [%d] and greater than or equal to 0, but received [%d]", "be less than [%d] and greater than or equal to 0, but received [%d]",
input_dims[j], index_value); input_dims[j],
index_value);
gather_i += (index_value * temp); gather_i += (index_value * temp);
temp *= input_dims[j]; temp *= input_dims[j];
} }
...@@ -76,13 +81,16 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims, ...@@ -76,13 +81,16 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims,
* return: output tensor * return: output tensor
*/ */
template <typename T, typename IndexT = int> template <typename T, typename IndexT = int>
void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, void GPUGather(const phi::GPUContext& ctx,
const Tensor& index, Tensor* output) { const DenseTensor& src,
const DenseTensor& index,
DenseTensor* output) {
if (index.dims().size() == 2) { if (index.dims().size() == 2) {
PADDLE_ENFORCE_EQ(index.dims()[1], 1, PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( index.dims()[1],
"If the index's rank of gather_op is 2," 1,
" the second dimension should be 1.")); phi::errors::InvalidArgument("If the index's rank of gather_op is 2,"
" the second dimension should be 1."));
} }
// index size // index size
...@@ -90,7 +98,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -90,7 +98,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
if (index_size == 0) return; if (index_size == 0) return;
auto src_dims = src.dims(); auto src_dims = src.dims();
framework::DDim output_dims(src_dims); phi::DDim output_dims(src_dims);
output_dims[0] = index_size; output_dims[0] = index_size;
// slice size // slice size
...@@ -105,18 +113,17 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -105,18 +113,17 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
int64_t n = slice_size * index_size; int64_t n = slice_size * index_size;
int64_t grid = (n + block - 1) / block; int64_t grid = (n + block - 1) / block;
GatherCUDAKernel<T, IndexT><<< GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
grid, block, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
p_src, p_index, p_output, index_size, slice_size); p_src, p_index, p_output, index_size, slice_size);
} }
template <typename DeviceContext, typename T, typename IndexT = int> template <typename T, typename IndexT = int>
void GPUGatherNd(const framework::ExecutionContext& context, void GPUGatherNd(const phi::GPUContext& ctx,
const Tensor& input, const Tensor& index, Tensor* output) { const DenseTensor& input,
const auto& ctx = context.template device_context<DeviceContext>(); const DenseTensor& index,
DenseTensor* output) {
const auto gplace = ctx.GetPlace(); const auto gplace = ctx.GetPlace();
auto cplace = platform::CPUPlace(); auto cplace = phi::CPUPlace();
auto index_dims = index.dims(); auto index_dims = index.dims();
auto index_dims_size = index_dims.size(); auto index_dims_size = index_dims.size();
...@@ -143,29 +150,36 @@ void GPUGatherNd(const framework::ExecutionContext& context, ...@@ -143,29 +150,36 @@ void GPUGatherNd(const framework::ExecutionContext& context,
v_input_dims[i] = input_dims[i]; v_input_dims[i] = input_dims[i];
} }
auto& dev_ctx = context.cuda_device_context(); phi::DenseTensor input_dims_tensor;
input_dims_tensor.Resize({input_dims_size});
auto* g_input_dims = ctx.Alloc<int64_t>(&input_dims_tensor);
int64_t bytes = input_dims_size * sizeof(int64_t); int64_t bytes = input_dims_size * sizeof(int64_t);
auto p_input_dims = memory::Alloc(dev_ctx, bytes);
int64_t* g_input_dims = reinterpret_cast<int64_t*>(p_input_dims->ptr()); paddle::memory::Copy(
memory::Copy(gplace, g_input_dims, cplace, v_input_dims.data(), bytes, gplace, g_input_dims, cplace, v_input_dims.data(), bytes, ctx.stream());
ctx.stream());
int block = 512; int block = 512;
int64_t n = slice_size * remain_numel; int64_t n = slice_size * remain_numel;
int64_t grid = (n + block - 1) / block; int64_t grid = (n + block - 1) / block;
GatherNdCUDAKernel<T, IndexT><<< GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
grid, block, 0, g_input_dims,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>( p_index,
p_input, g_input_dims, p_index, p_output, remain_numel, slice_size, p_output,
end_size); remain_numel,
slice_size,
end_size);
} }
template <typename T, typename U> template <typename T, typename U>
__global__ void GatherGPUKernel(const T* input, const U* index, T* out, __global__ void GatherGPUKernel(const T* input,
int64_t outer_dim_size, int64_t inner_dim_size, const U* index,
T* out,
int64_t outer_dim_size,
int64_t inner_dim_size,
int64_t out_index_dim_size, int64_t out_index_dim_size,
int64_t input_index_dim_size, int64_t size) { int64_t input_index_dim_size,
int64_t size) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
int64_t outer_size = outer_dim_size * out_index_dim_size; int64_t outer_size = outer_dim_size * out_index_dim_size;
for (; idx < size; idx += blockDim.x * gridDim.x) { for (; idx < size; idx += blockDim.x * gridDim.x) {
...@@ -180,7 +194,8 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out, ...@@ -180,7 +194,8 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out,
"please check whether the dimensions of index and " "please check whether the dimensions of index and "
"input meet the requirements. It should " "input meet the requirements. It should "
"be less than [%d] and greater than or equal to 0, but received [%d]", "be less than [%d] and greater than or equal to 0, but received [%d]",
input_index_dim_size, index_val); input_index_dim_size,
index_val);
int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index; int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
int64_t input_index = int64_t input_index =
...@@ -191,11 +206,14 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out, ...@@ -191,11 +206,14 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out,
} }
template <typename T, typename U> template <typename T, typename U>
__global__ void GatherGradGPUKernel(const T* input, const U* index, T* out, __global__ void GatherGradGPUKernel(const T* input,
const U* index,
T* out,
int64_t outer_dim_size, int64_t outer_dim_size,
int64_t inner_dim_size, int64_t inner_dim_size,
int64_t input_index_dim_size, int64_t input_index_dim_size,
int64_t out_index_dim_size, int64_t size) { int64_t out_index_dim_size,
int64_t size) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) { for (; idx < size; idx += blockDim.x * gridDim.x) {
int64_t inner_dim_index = idx / (outer_dim_size * input_index_dim_size); int64_t inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
...@@ -210,10 +228,11 @@ __global__ void GatherGradGPUKernel(const T* input, const U* index, T* out, ...@@ -210,10 +228,11 @@ __global__ void GatherGradGPUKernel(const T* input, const U* index, T* out,
} }
template <typename T, typename U> template <typename T, typename U>
void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, void GatherV2CUDAFunction(const DenseTensor* input,
const int axis, Tensor* out, const DenseTensor* index,
const paddle::platform::Place& place, const int axis,
const framework::ExecutionContext& ctx) { DenseTensor* out,
const phi::GPUContext& ctx) {
int64_t index_size = index->numel(); int64_t index_size = index->numel();
int64_t input_size = input->numel(); int64_t input_size = input->numel();
auto input_dim = input->dims(); auto input_dim = input->dims();
...@@ -241,24 +260,31 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, ...@@ -241,24 +260,31 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index,
auto out_dim = phi::make_ddim(out_dim_vec); auto out_dim = phi::make_ddim(out_dim_vec);
out->Resize(out_dim); out->Resize(out_dim);
auto* out_data = out->mutable_data<T>(place); auto* out_data = ctx.Alloc<T>(out);
int64_t out_size = out->numel(); int64_t out_size = out->numel();
if (out_size == 0) return; if (out_size == 0) return;
platform::GpuLaunchConfig config = auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_size); auto stream = ctx.stream();
auto stream = ctx.cuda_device_context().stream();
GatherGPUKernel< GatherGPUKernel<
T, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>( T,
input_data, index_data, out_data, outer_dim_size, inner_dim_size, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
index_size, index_dim_size, out_size); input_data,
index_data,
out_data,
outer_dim_size,
inner_dim_size,
index_size,
index_dim_size,
out_size);
} }
template <typename T, typename U> template <typename T, typename U>
void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, void GatherV2GradCUDAFunction(const DenseTensor* input,
const int axis, Tensor* out, const DenseTensor* index,
const paddle::platform::Place& place, const int axis,
const framework::ExecutionContext& ctx) { DenseTensor* out,
const phi::GPUContext& ctx) {
auto* index_data = index->data<U>(); auto* index_data = index->data<U>();
int64_t index_size = index->numel(); int64_t index_size = index->numel();
int64_t input_size = input->numel(); int64_t input_size = input->numel();
...@@ -279,19 +305,25 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, ...@@ -279,19 +305,25 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index,
outer_dim_size *= input_dim[i]; outer_dim_size *= input_dim[i];
} }
auto* out_data = out->mutable_data<T>(place); auto* out_data = ctx.Alloc<T>(out);
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto out_dim = out->dims(); auto out_dim = out->dims();
int64_t out_index_dim_size = out_dim[axis_index]; int64_t out_index_dim_size = out_dim[axis_index];
phi::funcs::set_constant(*dev_ctx, out, 0.0); phi::funcs::set_constant(ctx, out, 0.0);
platform::GpuLaunchConfig config = auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, input_size);
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_size); auto stream = ctx.stream();
auto stream = ctx.cuda_device_context().stream();
GatherGradGPUKernel< GatherGradGPUKernel<
T, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>( T,
input_data, index_data, out_data, outer_dim_size, inner_dim_size, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
input_index_dim_size, out_index_dim_size, input_size); input_data,
index_data,
out_data,
outer_dim_size,
inner_dim_size,
input_index_dim_size,
out_index_dim_size,
input_size);
} }
} // namespace operators
} // namespace paddle } // namespace funcs
} // namespace phi
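The relocated gather.cu.h is now written entirely against phi types: phi::GPUContext, DenseTensor, phi::errors, launch configuration from phi::backends::gpu, and scratch memory obtained through the context (ctx.Alloc, or a DenseTensor for the dims buffer in GPUGatherNd) instead of memory::Alloc. A hedged call-site sketch, assuming dev_ctx is a phi::GPUContext and src/index are float/int64 DenseTensors already on the device; as at the fluid call sites above, the caller sizes and allocates the output:

// Illustration only: gather rows of a [N, D] `src` selected by `index` ([K]).
phi::DenseTensor out;
out.Resize(phi::make_ddim({index.numel(), src.dims()[1]}));  // -> [K, D]
dev_ctx.Alloc<float>(&out);                                  // allocate via the context
phi::funcs::GPUGather<float, int64_t>(dev_ctx, src, index, &out);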
@@ -17,16 +17,13 @@ limitations under the License. */

#include <cstring>
#include <vector>

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {
namespace funcs {

/**
 * A thin wrapper for gathering on cpu tensor

@@ -36,22 +33,23 @@ using framework::Tensor;

 * return: output tensor
 */
template <typename T, typename IndexT = int>
void CPUGather(const phi::CPUContext& ctx,
               const DenseTensor& src,
               const DenseTensor& index,
               DenseTensor* output) {
  // check index of shape 1-D
  if (index.dims().size() == 2) {
    PADDLE_ENFORCE_EQ(
        index.dims()[1],
        1,
        phi::errors::InvalidArgument(
            "index.dims()[1] should be 1 when index.dims().size() = 2"
            "in gather_op, but received value is [%d].",
            index.dims()[1]));
  } else {
    PADDLE_ENFORCE_EQ(index.dims().size(),
                      1,
                      phi::errors::InvalidArgument(
                          "index.dims().size() should be 1 or 2 in gather_op,"
                          "but received shape's size is [%d].",
                          index.dims().size()));

@@ -74,29 +72,32 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,

  for (int64_t i = 0; i < index_size; ++i) {
    IndexT index_ = p_index[i];

    PADDLE_ENFORCE_LT(p_index[i],
                      input_size,
                      phi::errors::OutOfRange(
                          "The element of Index must be less than the size of "
                          "input dim size of axis which is %d, but received "
                          "index element which is %d in the %d index.",
                          input_size,
                          p_index[i],
                          i));
    PADDLE_ENFORCE_GE(p_index[i],
                      0,
                      phi::errors::OutOfRange(
                          "The element of Index must be greater than or equal "
                          "to 0, but received index element which is %d in the "
                          "%d index.",
                          p_index[i],
                          i));

    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
  }
}

template <typename T, typename IndexT = int>
void CPUGatherNd(const phi::CPUContext& ctx,
                 const DenseTensor& input,
                 const DenseTensor& index,
                 DenseTensor* output) {
  auto index_dims = index.dims();
  auto index_dims_size = index_dims.size();
  auto input_dims = input.dims();

@@ -124,25 +125,30 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,

    for (int64_t j = end_size - 1; j >= 0; --j) {
      IndexT index_value = p_index[i * end_size + j];
      PADDLE_ENFORCE_LT(
          index_value,
          input_dims[j],
          phi::errors::InvalidArgument(
              "Input(index[-1)] has wrong value, it is [%d]", index_value));
      PADDLE_ENFORCE_GE(
          index_value,
          0,
          phi::errors::InvalidArgument(
              "The value of Input(index) must be no less than 0"));
      index_ += (index_value * temp);
      temp *= input_dims[j];
    }
    memcpy(
        p_output + i * slice_size, p_input + index_ * slice_size, slice_bytes);
  }
}

template <typename T, typename U>
void GatherV2Function(const phi::CPUContext& ctx,
                      const DenseTensor* input,
                      const DenseTensor* index,
                      int axis,
                      DenseTensor* out) {
  auto* index_data = index->data<U>();
  int64_t index_size = index->numel();
  int64_t input_size = input->numel();

@@ -154,18 +160,23 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,

  int64_t input_index_dim_size = input_dim[axis_index];
  for (int64_t i = 0; i < index_size; i++) {
    PADDLE_ENFORCE_LT(index_data[i],
                      input_index_dim_size,
                      phi::errors::OutOfRange(
                          "The element of Index must be less than the size of "
                          "input dim size of axis which is %d, but received "
                          "index element which is %d in the %d index.",
                          input_index_dim_size,
                          index_data[i],
                          i));
    PADDLE_ENFORCE_GE(index_data[i],
                      0,
                      phi::errors::OutOfRange(
                          "The element of Index must be greater than or equal "
                          "to 0, but received index element which is %d in the "
                          "%d index.",
                          index_data[i],
                          i));
  }

  int64_t inner_dim_size = 1;

@@ -184,7 +195,7 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,

  auto out_dim = phi::make_ddim(out_dim_vec);
  out->Resize(out_dim);
  auto* out_data = ctx.Alloc<T>(out);

  int out_index = 0;
  for (int64_t i = 0; i < inner_dim_size; i++) {

@@ -200,9 +211,11 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,

}

template <typename T, typename U>
void GatherV2GradFunction(const phi::CPUContext& ctx,
                          const DenseTensor* input,
                          const DenseTensor* index,
                          const int axis,
                          DenseTensor* out) {
  auto* index_data = index->data<U>();

  auto input_dim = input->dims();

@@ -222,11 +235,10 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index,

    outer_dim_size *= input_dim[i];
  }

  auto* out_data = ctx.Alloc<T>(out);
  auto out_dim = out->dims();
  int64_t out_index_dim_size = out_dim[axis_index];
  phi::funcs::set_constant(ctx, out, 0.0);

  for (int64_t i = 0; i < inner_dim_size; i++) {
    for (int64_t j = 0; j < input_index_dim_size; j++) {

@@ -239,5 +251,5 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index,

    }
  }
}

}  // namespace funcs
}  // namespace phi
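On the CPU side, GatherV2Function now receives a phi::CPUContext and allocates its output through ctx.Alloc<T>, so callers no longer pass a Place. A hedged sketch of a caller follows; the wrapper name GatherAlongAxis, the int32_t index type, and the include path (assumed to sit under paddle/phi/kernels/funcs/ next to gather.cu.h) are illustrative only.

// Illustrative sketch of a CPU call site after the move; not code from this PR.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/gather.h"  // assumed destination path

template <typename T>
void GatherAlongAxis(const phi::CPUContext& dev_ctx,
                     const phi::DenseTensor& x,
                     const phi::DenseTensor& index,  // int32_t indices
                     int axis,
                     phi::DenseTensor* out) {
  // GatherV2Function resizes `out` and allocates it via ctx.Alloc<T>(out),
  // so the caller only supplies an empty DenseTensor and no explicit Place.
  phi::funcs::GatherV2Function<T, int32_t>(dev_ctx, &x, &index, axis, out);
}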
@@ -15,20 +15,19 @@ limitations under the License. */

#pragma once
#include <unordered_set>
#include <vector>

#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {
namespace funcs {

template <typename T, typename IndexT = int>
__global__ void ScatterInitCUDAKernel(const IndexT* indices,
                                      T* output,
                                      size_t index_size,
                                      size_t slice_size) {
  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice

@@ -47,9 +46,12 @@ __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,

}

template <typename T, typename IndexT = int>
__global__ void ScatterCUDAKernel(const T* params,
                                  const IndexT* indices,
                                  T* output,
                                  size_t index_size,
                                  size_t slice_size,
                                  bool overwrite) {
  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
    int64_t indices_i = i / slice_size;
    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice

@@ -72,9 +74,12 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,

}

template <typename T, typename IndexT = int>
__global__ void ScatterNdCUDAKernel(const T* update,
                                    const IndexT* indices,
                                    T* output,
                                    const int64_t* output_dims,
                                    size_t remain_size,
                                    size_t slice_size,
                                    size_t end_size) {
  CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {

@@ -90,7 +95,8 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,

          "please check whether the dimensions of index and "
          "input meet the requirements. It should "
          "be less than [%d] and greater or equal to 0, but received [%d]",
          output_dims[j],
          index_value);
      gather_i += (index_value * temp);
      temp *= output_dims[j];

@@ -109,21 +115,24 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,

 * return: output tensor
 */
template <typename T, typename IndexT = int>
void GPUScatterAssign(const phi::GPUContext& ctx,
                      const DenseTensor& src,
                      const DenseTensor& index,
                      DenseTensor* output,
                      bool overwrite = true) {
  // check index of shape 1-D
  if (index.dims().size() == 2) {
    PADDLE_ENFORCE_EQ(
        index.dims()[1],
        1,
        phi::errors::InvalidArgument("index.dims()[1] should be 1 when "
                                     "index.dims().size() = 2 in scatter_op."
                                     "But received value is [%d]",
                                     index.dims()[1]));
  } else {
    PADDLE_ENFORCE_EQ(index.dims().size(),
                      1,
                      phi::errors::InvalidArgument(
                          "index.dims().size() should be 1 or 2 in scatter_op."
                          "But received value is [%d]",
                          index.dims().size()));

@@ -131,7 +140,7 @@ void GPUScatterAssign(const framework::ExecutionContext& context,

  int64_t index_size = index.dims()[0];

  auto src_dims = src.dims();
  phi::DDim output_dims(src_dims);
  output_dims[0] = index_size;

  // slice size

@@ -150,23 +159,20 @@ void GPUScatterAssign(const framework::ExecutionContext& context,

  // if not overwrite mode, init data
  if (!overwrite) {
    ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
        p_index, p_output, index_size, slice_size);
  }

  ScatterCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_src, p_index, p_output, index_size, slice_size, overwrite);
}

// The function is only for scatter grad x,
// however update grad use gather
template <typename T, typename IndexT = int>
void GPUScatterGradForX(const phi::GPUContext& ctx,
                        const DenseTensor& index,
                        DenseTensor* output) {
  int64_t index_size = index.dims()[0];
  auto dst_dims = output->dims();
  // slice size

@@ -181,21 +187,18 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,

  int64_t n = slice_size * index_size;
  int64_t height = (n + block - 1) / block;

  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
  int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;

  ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_index, p_output, index_size, slice_size);
}

template <typename T, typename IndexT = int>
void GPUScatterNdAdd(const phi::GPUContext& ctx,
                     const DenseTensor& update,
                     const DenseTensor& index,
                     DenseTensor* output) {
  auto index_dims = index.dims();
  auto index_dims_size = index_dims.size();

@@ -219,31 +222,34 @@ void GPUScatterNdAdd(const framework::ExecutionContext& context,

  const size_t slice_bytes = slice_size * sizeof(T);
  // put output_dims int CUDA
  // gplace and cplace
  const auto gplace = ctx.GetPlace();
  auto cplace = phi::CPUPlace();

  std::vector<int64_t> v_output_dims(output_dims_size);
  for (int i = 0; i < output_dims_size; ++i) {
    v_output_dims[i] = output_dims[i];
  }

  phi::DenseTensor out_dims_tensor;
  out_dims_tensor.Resize({output_dims_size});
  auto* g_output_dims = ctx.Alloc<int64_t>(&out_dims_tensor);
  int64_t bytes = output_dims_size * sizeof(int64_t);
  paddle::memory::Copy(
      gplace, g_output_dims, cplace, v_output_dims.data(), bytes, ctx.stream());

  int block = 512;
  int64_t n = slice_size * remain_numel;
  int64_t grid = (n + block - 1) / block;

  ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
      p_update,
      p_index,
      p_output,
      g_output_dims,
      remain_numel,
      slice_size,
      end_size);
}

}  // namespace funcs
}  // namespace pten
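The GPU scatter entry points now take a phi::GPUContext directly instead of a framework::ExecutionContext. Below is a minimal usage sketch, not code from this commit: the wrapper OverwriteRows, the shapes, and the assumed include path under paddle/phi/kernels/funcs/ are illustrative; the GPUScatterAssign signature itself is taken from the diff above.

// Illustrative call-site sketch, assuming pre-allocated DenseTensors.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"  // assumed destination path

template <typename T>
void OverwriteRows(const phi::GPUContext& dev_ctx,
                   const phi::DenseTensor& updates,  // [K, D] rows to write
                   const phi::DenseTensor& index,    // [K] int destination rows
                   phi::DenseTensor* out) {          // [N, D], already allocated
  // overwrite = true replaces the addressed slices; with overwrite = false the
  // helper zero-fills them first and then accumulates, so duplicate indices sum.
  phi::funcs::GPUScatterAssign<T, int>(dev_ctx, updates, index, out,
                                       /*overwrite=*/true);
}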
@@ -15,18 +15,16 @@ limitations under the License. */

#pragma once
#include <cstring>
#include <string>
#include <unordered_set>

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"

namespace phi {
namespace funcs {

/**
 * Return the updated array pointer, use blas or eigen lib to optimize time

@@ -34,24 +32,31 @@ using Tensor = framework::Tensor;

 */
template <typename T, typename IndexT = int>
typename std::enable_if<std::is_floating_point<T>::value>::type
elementwise_inner_add(const phi::CPUContext& ctx,
                      const T* src_pointer,
                      T* dst_pointer,
                      size_t src_index,
                      IndexT dst_index,
                      size_t slice_size) {
  auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
  blas.VADD(slice_size,
            src_pointer + src_index * slice_size,
            dst_pointer + dst_index * slice_size,
            dst_pointer + dst_index * slice_size);
}

template <typename T, typename IndexT = int>
typename std::enable_if<!std::is_floating_point<T>::value>::type
elementwise_inner_add(const phi::CPUContext& ctx,
                      const T* src_pointer,
                      T* dst_pointer,
                      size_t src_index,
                      IndexT dst_index,
                      size_t slice_size) {
  using EigenVector = typename phi::EigenTensor<T, 1>::Type;
  using ConstEigenVector = typename phi::EigenTensor<T, 1>::ConstType;

  phi::EigenDim<1>::Type dim;
  dim[0] = slice_size;

  ConstEigenVector eigen_src(src_pointer + src_index * slice_size, dim);

@@ -67,22 +72,23 @@ elementwise_inner_add(const framework::ExecutionContext& ctx,

 * return: output tensor
 */
template <typename T, typename IndexT = int>
void ScatterAssign(const phi::CPUContext& ctx,
                   const DenseTensor& src,
                   const DenseTensor& index,
                   DenseTensor* output) {
  // check index of shape 1-D
  if (index.dims().size() == 2) {
    PADDLE_ENFORCE_EQ(
        index.dims()[1],
        1,
        phi::errors::InvalidArgument("index.dims()[1] should be 1 when "
                                     "index.dims().size() =2 in scatter_op."
                                     "But received value is [%d]",
                                     index.dims()[1]));
  } else {
    PADDLE_ENFORCE_EQ(index.dims().size(),
                      1,
                      phi::errors::InvalidArgument(
                          "index.dims().size() should be 1 or 2 in scatter_op."
                          "But received value is [%d]",
                          index.dims().size()));

@@ -99,12 +105,16 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,

  // check src shape and dst shape should match
  for (int i = 1; i < src_dims.size(); i++)
    PADDLE_ENFORCE_EQ(
        src_dims[i],
        dst_dims[i],
        phi::errors::InvalidArgument(
            "The dimensions of the source tensor and target tensor should"
            " match, but received source tensor's %d-th dimension is %d,"
            "target tensor's %d-th dimension is %d.",
            i,
            src_dims[i],
            i,
            dst_dims[i]));

  // slice size
  size_t slice_size = 1;

@@ -115,8 +125,9 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,

  for (int64_t i = 0; i < index_size; ++i) {
    IndexT index_ = p_index[i];

    PADDLE_ENFORCE_GE(index_,
                      0,
                      phi::errors::OutOfRange(
                          "The index is out of bounds, "
                          "please check whether the dimensions of index and "
                          "input meet the requirements. It should "

@@ -128,20 +139,20 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,

}

template <typename T, typename IndexT = int>
void ScatterAssignAdd(const phi::CPUContext& ctx,
                      const DenseTensor& src,
                      const DenseTensor& index,
                      DenseTensor* output) {
  // check index of shape 1-D
  PADDLE_ENFORCE_EQ(
      index.dims().size() == 1 ||
          (index.dims().size() == 2 && index.dims()[1] == 1),
      true,
      phi::errors::InvalidArgument(
          "index's shape is error, "
          "expect index'dims shape is 1 or 2 and index.dims[1] is 1"
          "but got index'dims shape is %d",
          index.dims().size()));

  int64_t index_size = index.dims()[0];

  auto src_dims = src.dims();

@@ -155,12 +166,16 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,

  // check src shape and dst shape should match
  for (int i = 1; i < src_dims.size(); i++)
    PADDLE_ENFORCE_EQ(
        src_dims[i],
        dst_dims[i],
        phi::errors::InvalidArgument(
            "The dimensions of the source tensor and target tensor should"
            " match, but received source tensor's %d-th dimension is %d,"
            "target tensor's %d-th dimension is %d.",
            i,
            src_dims[i],
            i,
            dst_dims[i]));

  // slice size
  size_t slice_size = 1;

@@ -172,36 +187,40 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,

  auto max_index = dst_dims[0];

  for (int64_t i = 0; i < index_size; ++i) {
    const IndexT& index_val = p_index[i];

    PADDLE_ENFORCE_GE(index_val,
                      0,
                      phi::errors::OutOfRange(
                          "The index is out of bounds, "
                          "please check whether the dimensions of index and "
                          "input meet the requirements. It should "
                          "be greater than or equal to 0, but received [%d]",
                          index_val));
    PADDLE_ENFORCE_LT(index_val,
                      max_index,
                      phi::errors::OutOfRange(
                          "The index is out of bounds, "
                          "please check whether the dimensions of index and "
                          "input meet the requirements. It should "
                          "be less than %d, but received %d",
                          max_index,
                          index_val));

    memset(p_output + slice_size * index_val, 0, slice_bytes);
  }

  // if not in overwrite mode, need to init output data
  for (int64_t i = 0; i < index_size; ++i) {
    const IndexT& index_val = p_index[i];
    elementwise_inner_add<T, IndexT>(
        ctx, p_src, p_output, i, index_val, slice_size);
  }
}

// The function is only for scatter grad x,
// however update grad use gather
template <typename T, typename IndexT = int>
void CPUScatterGradForX(const phi::CPUContext& ctx,
                        const DenseTensor& index,
                        DenseTensor* output) {
  int64_t index_size = index.dims()[0];
  auto dst_dims = output->dims();
  const IndexT* p_index = index.data<IndexT>();

@@ -216,12 +235,10 @@ void CPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,

}

template <typename T, typename IndexT = int>
void ScatterNdAdd(const phi::CPUContext& ctx,
                  const DenseTensor& update,
                  const DenseTensor& index,
                  DenseTensor* output) {
  // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
  auto index_dims = index.dims();
  auto index_dims_size = index_dims.size();

@@ -250,21 +267,23 @@ void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update,

    for (int64_t j = end_size - 1; j >= 0; --j) {
      IndexT index_value = p_index[i * end_size + j];
      PADDLE_ENFORCE_EQ(
          (index_value >= 0 && index_value < output_dims[j]),
          true,
          phi::errors::OutOfRange(
              "The index is out of bounds, "
              "please check whether the dimensions of index and "
              "input meet the requirements. It should "
              "be less than [%d] and greater or equal to 0, but received [%d]",
              output_dims[j],
              index_value));
      index_val += (index_value * temp);
      temp *= output_dims[j];
    }
    elementwise_inner_add<T, IndexT>(
        ctx, p_update, p_output, i, index_val, slice_size);
  }
}

}  // namespace funcs
}  // namespace phi
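Likewise on the CPU, ScatterAssignAdd is now called with a phi::CPUContext and DenseTensors. A hedged sketch of a call site follows, assuming an already-allocated output, int indices, and an include path under paddle/phi/kernels/funcs/; the wrapper name AccumulateRows is made up for illustration.

// Illustrative call-site sketch, assuming pre-allocated DenseTensors.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/scatter.h"  // assumed destination path

template <typename T>
void AccumulateRows(const phi::CPUContext& dev_ctx,
                    const phi::DenseTensor& updates,  // [K, D] rows to add
                    const phi::DenseTensor& index,    // [K] int destination rows
                    phi::DenseTensor* out) {          // [N, D], already allocated
  // The addressed rows of `out` are first reset to zero and every update row is
  // then added, so repeated indices accumulate instead of overwriting each other.
  phi::funcs::ScatterAssignAdd<T, int>(dev_ctx, updates, index, out);
}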