From 09258040e2584f4afd9114b994710232e6769970 Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Wed, 2 Mar 2022 18:50:26 +0800
Subject: [PATCH] Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi
 library (#40043)

* move gather.h, gather.cu.h, scatter.h and scatter.cu.h to the phi library

* fix CI

* fix ROCm CI
---
 .../fluid/operators/detection/bbox_util.cu.h  |   1 -
 .../detection/collect_fpn_proposals_op.cu     |  10 +-
 .../detection/collect_fpn_proposals_op.h      |   6 +-
 .../detection/distribute_fpn_proposals_op.cu  |   5 +-
 .../detection/distribute_fpn_proposals_op.h   |  15 +-
 .../detection/generate_mask_labels_op.cc      |   1 -
 .../detection/generate_proposal_labels_op.cc  |  16 +-
 .../detection/generate_proposals_op.cc        |  18 +-
 .../detection/generate_proposals_op.cu        |   9 +-
 .../detection/generate_proposals_v2_op.cc     |  18 +-
 .../detection/generate_proposals_v2_op.cu     |   9 +-
 paddle/fluid/operators/gather_nd_op.cu        |  94 +++++-----
 paddle/fluid/operators/gather_nd_op.h         |  66 ++++---
 paddle/fluid/operators/gather_op.cu           |  32 ++--
 paddle/fluid/operators/gather_op.h            |  68 +++----
 paddle/fluid/operators/gather_test.cc         |   4 +-
 paddle/fluid/operators/grid_sampler_op.h      |   1 -
 .../fluid/operators/math/segment_pooling.cu   |   6 +-
 paddle/fluid/operators/scatter_nd_add_op.cu   |  41 ++--
 paddle/fluid/operators/scatter_nd_add_op.h    |  41 ++--
 paddle/fluid/operators/scatter_op.cu          |  50 +++--
 paddle/fluid/operators/scatter_op.h           |  63 +++----
 paddle/fluid/operators/scatter_test.cc        |   4 +-
 paddle/fluid/operators/segment_pool_op.cu     |   1 -
 .../sequence_ops/sequence_scatter_op.cc       |   2 -
 .../sequence_ops/sequence_scatter_op.h        |   3 +-
 paddle/fluid/operators/viterbi_decode_op.cu   |  38 ++--
 paddle/fluid/operators/viterbi_decode_op.h    | 128 +++++++------
 .../kernels/funcs}/gather.cu.h                | 176 +++++++++++-------
 .../operators => phi/kernels/funcs}/gather.h  | 114 +++++++-----
 .../kernels/funcs}/scatter.cu.h               | 124 ++++++------
 .../operators => phi/kernels/funcs}/scatter.h | 165 ++++++++--------
 32 files changed, 702 insertions(+), 627 deletions(-)
 rename paddle/{fluid/operators => phi/kernels/funcs}/gather.cu.h (62%)
 rename paddle/{fluid/operators => phi/kernels/funcs}/gather.h (72%)
 rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.cu.h (67%)
 rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.h (65%)

diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index b361bc3ab7..f170fbbe4b 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index ce9ac3de4e..860fdd0179 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -23,11 +23,11 @@ namespace cub = hipcub;
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 
 namespace paddle {
 namespace operators {
@@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
     Tensor sorted_batch_id;
     sorted_batch_id.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
-    GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
-    GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t,
-                   &sorted_batch_id);
+    phi::funcs::GPUGather<T>(dev_ctx, concat_rois, index_out_t, &sorted_rois);
+    phi::funcs::GPUGather<int>(dev_ctx, roi_batch_id_list_gpu, index_out_t,
+                               &sorted_batch_id);
 
     Tensor batch_index_t;
     int* batch_idx_in =
@@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
         out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num, 0,
         sizeof(int) * 8, dev_ctx.stream());
 
-    GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
+    phi::funcs::GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
 
     Tensor length_lod;
     int* length_lod_data =
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
index a60f881ebf..e5ae9a6ccb 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -21,7 +21,6 @@ limitations under the License.*/
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     auto multi_layer_scores =
         context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
-    auto multi_rois_num = context.MultiInput<Tensor>("MultiLevelRoIsNum");
+    auto multi_rois_num =
+        context.MultiInput<framework::Tensor>("MultiLevelRoIsNum");
     int num_size = multi_rois_num.size();
 
     auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
@@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     }
     num_per_batch.emplace_back(post_nms_topN - pre_idx);
     if (context.HasOutput("RoisNum")) {
-      auto* rois_num = context.Output<Tensor>("RoisNum");
+      auto* rois_num = context.Output<framework::Tensor>("RoisNum");
       int* rois_num_data =
           rois_num->mutable_data<int>({batch_size}, context.GetPlace());
       for (int i = 0; i < batch_size; i++) {
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index c117fbd70f..7ad25e003b 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -24,9 +24,9 @@ namespace cub = hipcub;
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
         start = end;
         multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                            dev_ctx.GetPlace());
-        GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
+        phi::funcs::GPUGather<T>(dev_ctx, *fpn_rois, sub_idx,
+                                 multi_fpn_rois[i]);
       } else {
         multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                            dev_ctx.GetPlace());
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index 628cbcd761..5479e08c2a 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -28,10 +27,11 @@ namespace operators {
 
 const int kBoxDim = 4;
 
-inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
+inline std::vector<size_t> GetLodFromRoisNum(
+    const framework::Tensor* rois_num) {
   std::vector<size_t> rois_lod;
   auto* rois_num_data = rois_num->data<int>();
-  Tensor cpu_tensor;
+  framework::Tensor cpu_tensor;
   if (platform::is_gpu_place(rois_num->place())) {
     paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(),
                                       &cpu_tensor);
@@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     std::vector<size_t> fpn_rois_lod;
     int fpn_rois_num;
     if (context.HasInput("RoisNum")) {
-      auto* rois_num = context.Input<Tensor>("RoisNum");
+      auto* rois_num = context.Input<framework::Tensor>("RoisNum");
       fpn_rois_lod = GetLodFromRoisNum(rois_num);
     } else {
       fpn_rois_lod = fpn_rois->lod().back();
@@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     std::vector<int> num_rois_level(num_level, 0);
     std::vector<int> num_rois_level_integral(num_level + 1, 0);
     for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-      Tensor fpn_rois_slice =
+      auto fpn_rois_slice =
           fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
       const T* rois_data = fpn_rois_slice.data<T>();
       for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
@@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     std::vector<int> restore_index_inter(fpn_rois_num, -1);
     // distribute the rois into different fpn level by target level
     for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-      Tensor fpn_rois_slice =
+      auto fpn_rois_slice =
           fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
       const T* rois_data = fpn_rois_slice.data<T>();
       size_t cur_offset = fpn_rois_lod[i];
@@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < fpn_rois_num; ++i) {
       restore_index_data[restore_index_inter[i]] = i;
     }
-    auto multi_rois_num = context.MultiOutput<Tensor>("MultiLevelRoIsNum");
+    auto multi_rois_num =
+        context.MultiOutput<framework::Tensor>("MultiLevelRoIsNum");
     if (multi_rois_num.size() > 0) {
       int batch_size = fpn_rois_lod.size() - 1;
       for (int i = 0; i < num_level; ++i) {
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
index e6af1a5bbf..c9cc4e7220 100644
--- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/mask_util.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index 424aa07144..cbf1704840 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 
   Tensor fg_boxes, bg_boxes, fg_labels, bg_labels;
   fg_boxes.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
-  CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
+  phi::funcs::CPUGather<T>(context, boxes, fg_inds_t, &fg_boxes);
   bg_boxes.mutable_data<T>({bg_num, kBoxDim}, context.GetPlace());
-  CPUGather<T>(context, boxes, bg_inds_t, &bg_boxes);
+  phi::funcs::CPUGather<T>(context, boxes, bg_inds_t, &bg_boxes);
   Concat<T>(context, fg_boxes, bg_boxes, sampled_boxes);
-  CPUGather<T>(context, gt_boxes, gt_box_inds_t, sampled_gts);
+  phi::funcs::CPUGather<T>(context, gt_boxes, gt_box_inds_t, sampled_gts);
   fg_labels.mutable_data<int>({fg_num}, context.GetPlace());
-  CPUGather<int>(context, gt_classes, gt_label_inds_t, &fg_labels);
+  phi::funcs::CPUGather<int>(context, gt_classes, gt_label_inds_t, &fg_labels);
   bg_labels.mutable_data<int>({bg_num}, context.GetPlace());
   phi::funcs::set_constant(context, &bg_labels, 0);
   Concat<int>(context, fg_labels, bg_labels, sampled_labels);
 
   Tensor fg_max_overlap, bg_max_overlap;
   fg_max_overlap.mutable_data<T>({fg_num}, context.GetPlace());
-  CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
+  phi::funcs::CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
   bg_max_overlap.mutable_data<T>({bg_num}, context.GetPlace());
-  CPUGather<T>(context, max_overlap, bg_inds_t, &bg_max_overlap);
+  phi::funcs::CPUGather<T>(context, max_overlap, bg_inds_t, &bg_max_overlap);
   Concat<T>(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap);
 }
 
@@ -334,7 +334,7 @@ std::vector<Tensor> SampleRoisForOneImage(
     } else {
       proposals_num = keep.numel();
       roi_filter.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
-      CPUGather<T>(context, rpn_rois, keep, &roi_filter);
+      phi::funcs::CPUGather<T>(context, rpn_rois, keep, &roi_filter);
     }
     T* roi_filter_dt = roi_filter.data<T>();
     memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T));
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 8c4bd4ac61..d613082327 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
-#include "paddle/fluid/operators/gather.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
     var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
 
-    CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
-    CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
-    CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
-    CPUGather<T>(ctx, variances, index_t, &var_sel);
+    phi::funcs::CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
+    phi::funcs::CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
+    phi::funcs::CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
+    phi::funcs::CPUGather<T>(ctx, variances, index_t, &var_sel);
 
     Tensor proposals;
     proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
@@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     Tensor scores_filter;
     bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
     scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
-    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+    phi::funcs::CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+    phi::funcs::CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
     if (nms_thresh <= 0) {
       return std::make_pair(bbox_sel, scores_filter);
     }
@@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 
     proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
     scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
-    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+    phi::funcs::CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+    phi::funcs::CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
 
     return std::make_pair(proposals, scores_sel);
   }
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index 6e3c322c17..5fb7973fd8 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -85,8 +86,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   }
   proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
   scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
-  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+  phi::funcs::GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+  phi::funcs::GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
 
   if (nms_thresh <= 0) {
     return std::make_pair(proposals_filter, scores_filter);
@@ -102,8 +103,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor scores_nms, proposals_nms;
   proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
   scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
-  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+  phi::funcs::GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+  phi::funcs::GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
 
   return std::make_pair(proposals_nms, scores_nms);
 }
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
index 6351ea865c..1f1802574c 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
-#include "paddle/fluid/operators/gather.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
     anchor_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
     var_sel.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
 
-    CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
-    CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
-    CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
-    CPUGather<T>(ctx, variances, index_t, &var_sel);
+    phi::funcs::CPUGather<T>(ctx, scores_slice, index_t, &scores_sel);
+    phi::funcs::CPUGather<T>(ctx, bbox_deltas_slice, index_t, &bbox_sel);
+    phi::funcs::CPUGather<T>(ctx, anchors, index_t, &anchor_sel);
+    phi::funcs::CPUGather<T>(ctx, variances, index_t, &var_sel);
 
     Tensor proposals;
     proposals.mutable_data<T>({index_t.numel(), 4}, ctx.GetPlace());
@@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
     Tensor scores_filter;
     bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
     scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
-    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+    phi::funcs::CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+    phi::funcs::CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
     if (nms_thresh <= 0) {
       return std::make_pair(bbox_sel, scores_filter);
     }
@@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
 
     proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
     scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
-    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+    phi::funcs::CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+    phi::funcs::CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
 
     return std::make_pair(proposals, scores_sel);
   }
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
index 93ba3deca5..005309e8ee 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/operators/detection/bbox_util.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -86,8 +87,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   }
   proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
   scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
-  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+  phi::funcs::GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+  phi::funcs::GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
 
   if (nms_thresh <= 0) {
     return std::make_pair(proposals_filter, scores_filter);
@@ -104,8 +105,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   Tensor scores_nms, proposals_nms;
   proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
   scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
-  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
-  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+  phi::funcs::GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+  phi::funcs::GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
 
   return std::make_pair(proposals_nms, scores_nms);
 }
diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu
index 0de2798bf7..338c441161 100644
--- a/paddle/fluid/operators/gather_nd_op.cu
+++ b/paddle/fluid/operators/gather_nd_op.cu
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_nd_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel<T> {
 
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s], but "
-                          "desires to be [%s] or [%s].",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUGatherNd<DeviceContext, T, int>(ctx, *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUGatherNd<DeviceContext, T, int64_t>(ctx, *x, *index, output);
+    const auto &index_type = index->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s], but "
+            "desires to be [%s] or [%s].",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+    auto &dev_ctx = ctx.cuda_device_context();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUGatherNd<T, int>(dev_ctx, *x, *index, output);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
     }
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
@@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
     dxt.device(place) = dxt.constant(static_cast<T>(0));
     if (dO->numel() == 0) return;
 
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
+    const auto &index_type = index->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
 
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s],"
-                          "but desires to be [%s] or [%s].",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s],"
+            "but desires to be [%s] or [%s].",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
 
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterNdAdd<DeviceContext, T, int>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *dO, *index, dX);
+    auto &dev_ctx = ctx.cuda_device_context();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUScatterNdAdd<T, int>(dev_ctx, *dO, *index, dX);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
     }
   }
 };
@@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-using CUDA = paddle::platform::CUDADeviceContext;
-REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdOpCUDAKernel<CUDA, int16_t>,
-                        ops::GatherNdOpCUDAKernel<CUDA, bool>,
-                        ops::GatherNdOpCUDAKernel<CUDA, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel<float>,
+                        ops::GatherNdOpCUDAKernel<double>,
+                        ops::GatherNdOpCUDAKernel<int64_t>,
+                        ops::GatherNdOpCUDAKernel<int>,
+                        ops::GatherNdOpCUDAKernel<int16_t>,
+                        ops::GatherNdOpCUDAKernel<bool>,
+                        ops::GatherNdOpCUDAKernel<plat::float16>);
 
-REGISTER_OP_CUDA_KERNEL(gather_nd_grad,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, float>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, double>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int64_t>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, int>,
-                        ops::GatherNdGradOpCUDAKernel<CUDA, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel<float>,
+                        ops::GatherNdGradOpCUDAKernel<double>,
+                        ops::GatherNdGradOpCUDAKernel<int64_t>,
+                        ops::GatherNdGradOpCUDAKernel<int>,
+                        ops::GatherNdGradOpCUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h
index f458c0e180..d54261008e 100644
--- a/paddle/fluid/operators/gather_nd_op.h
+++ b/paddle/fluid/operators/gather_nd_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 namespace paddle {
 namespace operators {
@@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
 
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s],"
-                          "but desires to be [%s] or [%s]",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      CPUGatherNd<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      CPUGatherNd<T, int64_t>(ctx.device_context(), *x, *index, output);
+    auto index_type = index->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s],"
+            "but desires to be [%s] or [%s]",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::CPUGatherNd<T, int>(dev_ctx, *x, *index, output);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *x, *index, output);
     }
   }
 };
@@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         platform::is_cpu_place(ctx.GetPlace()), true,
         platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
+
     auto *index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel<T> {
     dxt.device(place) = dxt.constant(static_cast<T>(0));
     if (dO->numel() == 0) return;
 
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s],"
-                          "but desires to be [%s] or [%s]",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      ScatterNdAdd<T, int32_t>(ctx, *dO, *index, dX);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      ScatterNdAdd<T, int64_t>(ctx, *dO, *index, dX);
+    auto index_type = index->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s],"
+            "but desires to be [%s] or [%s]",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
     }
   }
 };
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index a502a13040..8f1d9284c5 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
@@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     }
     const auto &place = ctx.GetPlace();
     const auto &index_type = framework::TransToProtoVarType(index->dtype());
+    const auto &dev_ctx = ctx.cuda_device_context();
     if (axis != 0) {
       if (index_type == framework::proto::VarType::INT32) {
-        GatherV2CUDAFunction<T, int32_t>(x, index, axis, output, place, ctx);
+        phi::funcs::GatherV2CUDAFunction<T, int32_t>(x, index, axis, output,
+                                                     dev_ctx);
       } else if (index_type == framework::proto::VarType::INT64) {
-        GatherV2CUDAFunction<T, int64_t>(x, index, axis, output, place, ctx);
+        phi::funcs::GatherV2CUDAFunction<T, int64_t>(x, index, axis, output,
+                                                     dev_ctx);
       }
       return;
     }
@@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
     if (index_type == framework::proto::VarType::INT32) {
-      GPUGather<T, int>(ctx.device_context(), *x, *index, output);
+      phi::funcs::GPUGather<T, int>(dev_ctx, *x, *index, output);
     } else if (index_type == framework::proto::VarType::INT64) {
-      GPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
+      phi::funcs::GPUGather<T, int64_t>(dev_ctx, *x, *index, output);
     }
   }
 };
@@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
       }
     }
 
+    const auto &dev_ctx = ctx.cuda_device_context();
     const auto &index_type = framework::TransToProtoVarType(index->dtype());
     if (axis != 0) {
       if (index_type == framework::proto::VarType::INT32) {
-        GatherV2GradCUDAFunction<T, int32_t>(dO, index, axis, dX,
-                                             ctx.GetPlace(), ctx);
+        phi::funcs::GatherV2GradCUDAFunction<T, int32_t>(dO, index, axis, dX,
+                                                         dev_ctx);
       } else if (index_type == framework::proto::VarType::INT64) {
-        GatherV2GradCUDAFunction<T, int64_t>(dO, index, axis, dX,
-                                             ctx.GetPlace(), ctx);
+        phi::funcs::GatherV2GradCUDAFunction<T, int64_t>(dO, index, axis, dX,
+                                                         dev_ctx);
       }
       return;
     }
@@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
     dxt.device(place) = dxt.constant(static_cast<T>(0));
     if (dO->numel() == 0) return;
     if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterAssign<T, int>(ctx, *dO, *index, dX,
-                               ctx.Attr<bool>("overwrite"));
+      phi::funcs::GPUScatterAssign<T, int>(dev_ctx, *dO, *index, dX,
+                                           ctx.Attr<bool>("overwrite"));
     } else if (index_type == framework::proto::VarType::INT64) {
-      GPUScatterAssign<T, int64_t>(ctx, *dO, *index, dX,
-                                   ctx.Attr<bool>("overwrite"));
+      phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX,
+                                               ctx.Attr<bool>("overwrite"));
     }
   }
 };
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 016c2b398d..94de694b2f 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 namespace paddle {
 namespace operators {
@@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel<T> {
     // get axis from tensor
     if (ctx.HasInput("Axis")) {
       const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      const auto &axis_type =
-          framework::TransToProtoVarType(axis_tensor->dtype());
-      if (axis_type == framework::proto::VarType::INT32) {
+      const auto &axis_type = axis_tensor->dtype();
+      if (axis_type == phi::DataType::INT32) {
         axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-      } else if (axis_type == framework::proto::VarType::INT64) {
+      } else if (axis_type == phi::DataType::INT64) {
         axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
       }
     }
-    const auto &place = ctx.GetPlace();
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
+    const auto &index_type = index->dtype();
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
     if (axis != 0) {
-      if (index_type == framework::proto::VarType::INT32) {
-        GatherV2Function<T, int32_t>(x, index, axis, output, place);
-      } else if (index_type == framework::proto::VarType::INT64) {
-        GatherV2Function<T, int64_t>(x, index, axis, output, place);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::GatherV2Function<T, int32_t>(dev_ctx, x, index, axis,
+                                                 output);
+      } else if (index_type == phi::DataType::INT64) {
+        phi::funcs::GatherV2Function<T, int64_t>(dev_ctx, x, index, axis,
+                                                 output);
       }
       return;
     }
 
     output->mutable_data<T>(ctx.GetPlace());
     if (x->numel() == 0) return;
-    if (index_type == framework::proto::VarType::INT32) {
-      CPUGather<T, int>(ctx.device_context(), *x, *index, output);
-    } else if (index_type == framework::proto::VarType::INT64) {
-      CPUGather<T, int64_t>(ctx.device_context(), *x, *index, output);
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::CPUGather<T, int>(dev_ctx, *x, *index, output);
+    } else if (index_type == phi::DataType::INT64) {
+      phi::funcs::CPUGather<T, int64_t>(dev_ctx, *x, *index, output);
     }
   }
 };
@@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
     if (ctx.HasInput("Axis")) {
       const Tensor *axis_tensor = ctx.Input<Tensor>("Axis");
-      const auto &axis_type =
-          framework::TransToProtoVarType(axis_tensor->dtype());
-      if (axis_type == framework::proto::VarType::INT32) {
+      const auto &axis_type = axis_tensor->dtype();
+      if (axis_type == phi::DataType::INT32) {
         axis = static_cast<int>(axis_tensor->data<int32_t>()[0]);
-      } else if (axis_type == framework::proto::VarType::INT64) {
+      } else if (axis_type == phi::DataType::INT64) {
         axis = static_cast<int>(axis_tensor->data<int64_t>()[0]);
       }
     }
-    const auto &index_type = framework::TransToProtoVarType(index->dtype());
+    const auto &index_type = index->dtype();
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
 
     if (axis != 0) {
-      if (index_type == framework::proto::VarType::INT32) {
-        GatherV2GradFunction<T, int32_t>(dO, index, axis, dX, ctx.GetPlace());
-      } else if (index_type == framework::proto::VarType::INT64) {
-        GatherV2GradFunction<T, int64_t>(dO, index, axis, dX, ctx.GetPlace());
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::GatherV2GradFunction<T, int32_t>(dev_ctx, dO, index, axis,
+                                                     dX);
+      } else if (index_type == phi::DataType::INT64) {
+        phi::funcs::GatherV2GradFunction<T, int64_t>(dev_ctx, dO, index, axis,
+                                                     dX);
       }
       return;
     }
 
     dX->mutable_data<T>(ctx.GetPlace());
     auto dxt = framework::EigenVector<T>::Flatten(*dX);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
+    auto &place = *dev_ctx.eigen_device();
     dxt.device(place) = dxt.constant(static_cast<T>(0));
     if (dO->numel() == 0) return;
     bool overwrite = ctx.Attr<bool>("overwrite");
 
-    if (index_type == framework::proto::VarType::INT32) {
+    if (index_type == phi::DataType::INT32) {
       if (overwrite) {
-        ScatterAssign<T, int32_t>(ctx.device_context(), *dO, *index, dX);
+        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *dO, *index, dX);
       } else {
-        ScatterAssignAdd<T, int32_t>(ctx, *dO, *index, dX);
+        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *dO, *index, dX);
       }
-    } else if (index_type == framework::proto::VarType::INT64) {
+    } else if (index_type == phi::DataType::INT64) {
       if (overwrite) {
-        ScatterAssign<T, int64_t>(ctx.device_context(), *dO, *index, dX);
+        phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *dO, *index, dX);
       } else {
-        ScatterAssignAdd<T, int64_t>(ctx, *dO, *index, dX);
+        phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *dO, *index, dX);
       }
     }
   }
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 0f3dcdadcf..c962dd0652 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 
 TEST(Gather, GatherData) {
   paddle::framework::Tensor* src = new paddle::framework::Tensor();
@@ -39,7 +39,7 @@ TEST(Gather, GatherData) {
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
+  phi::funcs::CPUGather<int>(ctx, *src, *index, output);
   delete cpu_place;
   cpu_place = NULL;
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index 8f3c6660f5..93e9669427 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <utility>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu
index bb6d8756bd..fbdcb99c02 100644
--- a/paddle/fluid/operators/math/segment_pooling.cu
+++ b/paddle/fluid/operators/math/segment_pooling.cu
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/segment_pooling.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 namespace paddle {
@@ -379,9 +379,9 @@ class SegmentPoolGradFunctor<platform::CUDADeviceContext, T, IndexT> {
       SimpleDiv<T><<<config.block_per_grid.x, config.thread_per_block.x, 0,
                      context.stream()>>>(mean_grad.data<T>(),
                                          summed_ids->data<T>(), len, dim);
-      GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
+      phi::funcs::GPUGather<T, IndexT>(context, mean_grad, segments, in_grad);
     } else if (pooltype == "SUM") {
-      GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
+      phi::funcs::GPUGather<T, IndexT>(context, out_grad, segments, in_grad);
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN "
diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu
index 6448f8cc40..2fe3fcb759 100644
--- a/paddle/fluid/operators/scatter_nd_add_op.cu
+++ b/paddle/fluid/operators/scatter_nd_add_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
 #include "paddle/fluid/operators/scatter_nd_add_op.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
@@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel<T> {
     auto *Out = ctx.Output<Tensor>("Out");
 
     framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s], but "
-                          "desires to be [%s] or [%s].",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterNdAdd<DeviceContext, T, int32_t>(ctx, *Updates, *Ids, Out);
+    const auto &index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s], but "
+            "desires to be [%s] or [%s].",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+    auto &dev_ctx = ctx.cuda_device_context();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
     } else {
-      GPUScatterNdAdd<DeviceContext, T, int64_t>(ctx, *Updates, *Ids, Out);
+      phi::funcs::GPUScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
     }
   }
 };
@@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel<T> {
     }
     if (dUpdates) {
       dUpdates->mutable_data<T>(ctx.GetPlace());
+      auto &dev_ctx = ctx.cuda_device_context();
       // Gradient by Gather
-      const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-      if (index_type == framework::proto::VarType::INT32) {
-        GPUGatherNd<DeviceContext, T, int32_t>(ctx, *dOut, *Ids, dUpdates);
+      const auto &index_type = Ids->dtype();
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::GPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
       } else {
-        GPUGatherNd<DeviceContext, T, int64_t>(ctx, *dOut, *Ids, dUpdates);
+        phi::funcs::GPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
       }
     }
   }
diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h
index 2bdf9ec58a..81c95fe55a 100644
--- a/paddle/fluid/operators/scatter_nd_add_op.h
+++ b/paddle/fluid/operators/scatter_nd_add_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 namespace paddle {
 namespace operators {
@@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel<T> {
 
     // In place output: Out = X
     framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s], but "
-                          "desires to be [%s] or [%s].",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
+    const auto &index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s], but "
+            "desires to be [%s] or [%s].",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
 
-    if (index_type == framework::proto::VarType::INT32) {
-      ScatterNdAdd<T, int32_t>(ctx, *Updates, *Ids, Out);
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
     } else {
-      ScatterNdAdd<T, int64_t>(ctx, *Updates, *Ids, Out);
+      phi::funcs::ScatterNdAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
     }
   }
 };
@@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel<T> {
     if (dUpdates) {
       dUpdates->mutable_data<T>(ctx.GetPlace());
       // Gradient by Gather: dUpdates = dO[Ids]
-      const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-      if (index_type == framework::proto::VarType::INT32) {
-        CPUGatherNd<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+      const auto &index_type = Ids->dtype();
+      auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
       } else {
-        CPUGatherNd<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+        phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
       }
     }
   }
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index 549e30803b..7755e376bc 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "paddle/fluid/operators/scatter.cu.h"
 #include "paddle/fluid/operators/scatter_op.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
+#include "paddle/phi/kernels/funcs/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
@@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
 
     framework::TensorCopy(*X, ctx.GetPlace(), Out);
     // use template class to support int32_t and int64_t
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
+    auto index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
     PADDLE_ENFORCE_EQ(
         index_type_match, true,
         platform::errors::InvalidArgument(
             "scatter_op Index holds the wrong type, it holds [%s],"
             "but desires to be [%s] or [%s].",
-            paddle::framework::DataTypeToString(index_type),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT32),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT64)));
-    if (index_type == framework::proto::VarType::INT32) {
-      GPUScatterAssign<T, int32_t>(ctx, *Updates, *Ids, Out, overwrite);
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+    auto &dev_ctx = ctx.cuda_device_context();
+    if (index_type == phi::DataType::INT32) {
+      phi::funcs::GPUScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out,
+                                               overwrite);
     } else {
-      GPUScatterAssign<T, int64_t>(ctx, *Updates, *Ids, Out, overwrite);
+      phi::funcs::GPUScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out,
+                                               overwrite);
     }
   }
 };
@@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
     auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
+    auto index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
     PADDLE_ENFORCE_EQ(
         index_type_match, true,
         platform::errors::InvalidArgument(
             "scatter_op index holds the wrong type, it holds [%s],"
             "but desires to be [%s] or [%s]",
-            paddle::framework::DataTypeToString(index_type),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT32),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT64)));
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
 
+    auto &dev_ctx = ctx.cuda_device_context();
     if (dX) {
       framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-      if (index_type == framework::proto::VarType::INT32) {
-        GPUScatterGradForX<T, int32_t>(ctx.device_context(), *Ids, dX);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::GPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
       } else {
-        GPUScatterGradForX<T, int64_t>(ctx.device_context(), *Ids, dX);
+        phi::funcs::GPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
       }
     }
 
     if (dUpdates) {
       dUpdates->mutable_data<T>(ctx.GetPlace());
       // Gradient by Gather: dUpdates = dO[Ids]
-      if (index_type == framework::proto::VarType::INT32) {
-        GPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::GPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
       } else {
-        GPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+        phi::funcs::GPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
       }
     }
   }
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 69ab6c7135..7733181a93 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/gather.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 namespace paddle {
 namespace operators {
@@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     // In place output: Out = X, Out[Ids] = Updates
     framework::TensorCopy(*X, ctx.GetPlace(), Out);
     // Apply ScatterUpdate: Out[index] = Updates[:]
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
-    PADDLE_ENFORCE_EQ(index_type_match, true,
-                      platform::errors::InvalidArgument(
-                          "Index holds the wrong type, it holds [%s],"
-                          "but desires to be [%s] or [%s].",
-                          paddle::framework::DataTypeToString(index_type),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT32),
-                          paddle::framework::DataTypeToString(
-                              framework::proto::VarType::INT64)));
+    const auto &index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
+    PADDLE_ENFORCE_EQ(
+        index_type_match, true,
+        platform::errors::InvalidArgument(
+            "Index holds the wrong type, it holds [%s],"
+            "but desires to be [%s] or [%s].",
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
     if (overwrite) {
-      if (index_type == framework::proto::VarType::INT32) {
-        ScatterAssign<T, int32_t>(ctx.device_context(), *Updates, *Ids, Out);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
       } else {
-        ScatterAssign<T, int64_t>(ctx.device_context(), *Updates, *Ids, Out);
+        phi::funcs::ScatterAssign<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
       }
     } else {
-      if (index_type == framework::proto::VarType::INT32) {
-        ScatterAssignAdd<T, int32_t>(ctx, *Updates, *Ids, Out);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, *Updates, *Ids, Out);
       } else {
-        ScatterAssignAdd<T, int64_t>(ctx, *Updates, *Ids, Out);
+        phi::funcs::ScatterAssignAdd<T, int64_t>(dev_ctx, *Updates, *Ids, Out);
       }
     }
   }
@@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    const auto &index_type = framework::TransToProtoVarType(Ids->dtype());
-    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
-                            index_type == framework::proto::VarType::INT64;
+    const auto &index_type = Ids->dtype();
+    bool index_type_match = index_type == phi::DataType::INT32 ||
+                            index_type == phi::DataType::INT64;
     PADDLE_ENFORCE_EQ(
         index_type_match, true,
         platform::errors::InvalidArgument(
             "scatter_op index holds the wrong type, it holds [%s],"
             "but desires to be [%s] or [%s]",
-            paddle::framework::DataTypeToString(index_type),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT32),
-            paddle::framework::DataTypeToString(
-                framework::proto::VarType::INT64)));
+            index_type, phi::DataType::INT32, phi::DataType::INT64));
 
+    auto &dev_ctx = ctx.template device_context<phi::CPUContext>();
     if (dX) {
       framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
-      if (index_type == framework::proto::VarType::INT32) {
-        CPUScatterGradForX<T, int32_t>(ctx.device_context(), *Ids, dX);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::CPUScatterGradForX<T, int32_t>(dev_ctx, *Ids, dX);
       } else {
-        CPUScatterGradForX<T, int64_t>(ctx.device_context(), *Ids, dX);
+        phi::funcs::CPUScatterGradForX<T, int64_t>(dev_ctx, *Ids, dX);
       }
     }
 
     if (dUpdates) {
       dUpdates->mutable_data<T>(ctx.GetPlace());
       // Gradient by Gather: dUpdates = dO[Ids]
-      if (index_type == framework::proto::VarType::INT32) {
-        CPUGather<T, int32_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+      if (index_type == phi::DataType::INT32) {
+        phi::funcs::CPUGather<T, int32_t>(dev_ctx, *dOut, *Ids, dUpdates);
       } else {
-        CPUGather<T, int64_t>(ctx.device_context(), *dOut, *Ids, dUpdates);
+        phi::funcs::CPUGather<T, int64_t>(dev_ctx, *dOut, *Ids, dUpdates);
       }
     }
   }
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
index 0a4cab5fac..93f2d60e5f 100644
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 #include <gtest/gtest.h>
 
@@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) {
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  paddle::operators::ScatterAssign<float>(ctx, src, index, &output);
+  phi::funcs::ScatterAssign<float>(ctx, src, index, &output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data<float>()[i], 0.0f);
diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu
index 4e20844dc3..e147e62a98 100644
--- a/paddle/fluid/operators/segment_pool_op.cu
+++ b/paddle/fluid/operators/segment_pool_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/segment_pool_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
index 2d4730635f..25c12ab565 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc
@@ -16,8 +16,6 @@ limitations under the License. */
 #include <memory>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h
index 365381abc4..2960b77d5a 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h
@@ -15,8 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
+#include "paddle/phi/kernels/funcs/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
index 3c546dd815..68628fb274 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ b/paddle/fluid/operators/viterbi_decode_op.cu
@@ -11,8 +11,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/phi/kernels/funcs/gather.cu.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) {
 
 template <template <typename T> typename BinaryFunctor, typename T>
 struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
-  void operator()(const platform::CUDADeviceContext& dev_ctx, const Tensor& lhs,
-                  const Tensor& rhs, Tensor* output) {
-    std::vector<const Tensor*> ins{&lhs, &rhs};
-    std::vector<Tensor*> outs{output};
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const framework::Tensor& lhs, const framework::Tensor& rhs,
+                  framework::Tensor* output) {
+    std::vector<const framework::Tensor*> ins{&lhs, &rhs};
+    std::vector<framework::Tensor*> outs{output};
     paddle::operators::LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T,
                                                    T>(dev_ctx, ins, &outs, -1,
                                                       BinaryFunctor<T>());
@@ -75,10 +76,11 @@ struct BinaryOperation<platform::CUDADeviceContext, BinaryFunctor, T> {
 template <template <typename InT, typename OutT> typename CompareFunctor,
           typename T>
 struct GetMask<platform::CUDADeviceContext, CompareFunctor, T> {
-  void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
-                  const Tensor& rhs, Tensor* mask) {
-    std::vector<const Tensor*> ins = {&lhs, &rhs};
-    std::vector<Tensor*> outs = {mask};
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& lhs, const framework::Tensor& rhs,
+                  framework::Tensor* mask) {
+    std::vector<const framework::Tensor*> ins = {&lhs, &rhs};
+    std::vector<framework::Tensor*> outs = {mask};
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
         dev_ctx, ins, &outs, CompareFunctor<int64_t, T>());
@@ -131,8 +133,9 @@ struct ARange<platform::CUDADeviceContext> {
 
 template <typename T, typename IndType>
 struct Argmax<platform::CUDADeviceContext, T, IndType> {
-  void operator()(const framework::ExecutionContext& ctx, const Tensor& input,
-                  Tensor* out_idx, Tensor* out, int axis) {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out_idx,
+                  framework::Tensor* out, int axis) {
     framework::DDim input_dims = input.dims();
     int64_t numel = input.numel();
     int64_t groups = numel / input_dims[axis];
@@ -166,8 +169,8 @@ struct Argmax<platform::CUDADeviceContext, T, IndType> {
 template <typename T>
 struct GetMaxValue<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& dev_ctx,
-                  const Tensor& input, T* max_value) {
-    Tensor out_data;
+                  const framework::Tensor& input, T* max_value) {
+    framework::Tensor out_data;
     out_data.Resize(phi::make_ddim({1}));
     out_data.mutable_data<T>(platform::CUDAPlace());
     switch (ComputeBlockSize(input.numel())) {
@@ -177,7 +180,7 @@ struct GetMaxValue<platform::CUDADeviceContext, T> {
               1, input.numel(), 1, input.data<int64_t>(), nullptr,
               out_data.data<int64_t>()));
     }
-    Tensor max_value_tensor;
+    framework::Tensor max_value_tensor;
     framework::TensorCopy(out_data, platform::CPUPlace(), &max_value_tensor);
     *max_value = max_value_tensor.data<T>()[0];
   }
@@ -185,9 +188,10 @@ struct GetMaxValue<platform::CUDADeviceContext, T> {
 
 template <typename T, typename IndexT>
 struct Gather<platform::CUDADeviceContext, T, IndexT> {
-  void operator()(const platform::CUDADeviceContext& ctx, const Tensor& src,
-                  const Tensor& index, Tensor* output) {
-    GPUGather<T, IndexT>(ctx, src, index, output);
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& src, const framework::Tensor& index,
+                  framework::Tensor* output) {
+    phi::funcs::GPUGather<T, IndexT>(ctx, src, index, output);
   }
 };
 
diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h
index bf12a03e7b..0974177e6c 100644
--- a/paddle/fluid/operators/viterbi_decode_op.h
+++ b/paddle/fluid/operators/viterbi_decode_op.h
@@ -17,10 +17,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/controlflow/compare_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/operators/unique_op.h"
+#include "paddle/phi/kernels/funcs/gather.h"
 #ifdef PADDLE_WITH_MKLML
 #include <omp.h>
 #endif
@@ -28,12 +28,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using LoDTensor = framework::LoDTensor;
-
 template <typename DeviceContext, typename T, typename IndType>
 struct Argmax {
-  void operator()(const framework::ExecutionContext& ctx, const Tensor& input,
-                  Tensor* out_idx, Tensor* out, int axis) {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out_idx,
+                  framework::Tensor* out, int axis) {
     framework::DDim input_dims = input.dims();
     int64_t pre = 1;
     int64_t post = 1;
@@ -82,7 +81,7 @@ struct ARange {
 
 template <typename DeviceContext, typename T>
 struct GetMaxValue {
-  void operator()(const DeviceContext& dev_ctx, const Tensor& input,
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& input,
                   T* max_value) {
     auto input_ptr = input.data<T>();
     auto num = input.numel();
@@ -92,14 +91,15 @@ struct GetMaxValue {
 
 template <typename DeviceContext, typename T, typename IndexT = int>
 struct Gather {
-  void operator()(const DeviceContext& ctx, const Tensor& src,
-                  const Tensor& index, Tensor* output) {
-    CPUGather<T, IndexT>(ctx, src, index, output);
+  void operator()(const DeviceContext& ctx, const framework::Tensor& src,
+                  const framework::Tensor& index, framework::Tensor* output) {
+    phi::funcs::CPUGather<T, IndexT>(ctx, src, index, output);
   }
 };
 
 template <typename T, typename Functor, typename OutT = T>
-void SameDimsBinaryOP(const Tensor& lhs, const Tensor& rhs, Tensor* out) {
+void SameDimsBinaryOP(const framework::Tensor& lhs,
+                      const framework::Tensor& rhs, framework::Tensor* out) {
   const T* lhs_ptr = lhs.data<T>();
   const T* rhs_ptr = rhs.data<T>();
   OutT* out_ptr = out->data<OutT>();
@@ -116,8 +116,9 @@ template <typename DeviceContext,
           template <typename InT, typename OutT> typename CompareFunctor,
           typename T>
 struct GetMask {
-  void operator()(const framework::ExecutionContext& ctx, const Tensor& lhs,
-                  const Tensor& rhs, Tensor* mask) {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& lhs, const framework::Tensor& rhs,
+                  framework::Tensor* mask) {
     SameDimsBinaryOP<int64_t, CompareFunctor<int64_t, T>, T>(lhs, rhs, mask);
   }
 };
@@ -161,8 +162,9 @@ struct GetInputIndex<false> {
 };
 
 template <typename T, typename Functor, bool is_multi_threads = false>
-void SimpleBroadcastBinaryOP(const Tensor& lhs, const Tensor& rhs,
-                             Tensor* out) {
+void SimpleBroadcastBinaryOP(const framework::Tensor& lhs,
+                             const framework::Tensor& rhs,
+                             framework::Tensor* out) {
   const T* lhs_ptr = lhs.data<T>();
   const T* rhs_ptr = rhs.data<T>();
   T* out_ptr = out->data<T>();
@@ -200,8 +202,8 @@ void SimpleBroadcastBinaryOP(const Tensor& lhs, const Tensor& rhs,
 template <typename DeviceContext, template <typename T> typename BinaryFunctor,
           typename T>
 struct BinaryOperation {
-  void operator()(const DeviceContext& dev_ctx, const Tensor& lhs,
-                  const Tensor& rhs, Tensor* output) {
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor& lhs,
+                  const framework::Tensor& rhs, framework::Tensor* output) {
     if (lhs.dims() == rhs.dims()) {
       SameDimsBinaryOP<T, BinaryFunctor<T>>(lhs, rhs, output);
     } else {
@@ -222,20 +224,21 @@ struct BinaryOperation {
 
 class TensorBuffer {
  public:
-  explicit TensorBuffer(const LoDTensor& in) : buffer_(in), offset_(0) {
+  explicit TensorBuffer(const framework::LoDTensor& in)
+      : buffer_(in), offset_(0) {
     buffer_.Resize({buffer_.numel()});
   }
-  Tensor GetBufferBlock(std::initializer_list<int64_t> shape) {
+  framework::Tensor GetBufferBlock(std::initializer_list<int64_t> shape) {
     int64_t size = std::accumulate(shape.begin(), shape.end(), 1,
                                    std::multiplies<int64_t>());
-    Tensor block = buffer_.Slice(offset_, offset_ + size);
+    framework::Tensor block = buffer_.Slice(offset_, offset_ + size);
     offset_ += size;
     block.Resize(shape);
     return block;
   }
 
  private:
-  LoDTensor buffer_;  // need to resize 1-D Tensor
+  framework::LoDTensor buffer_;  // need to resize 1-D Tensor
   int offset_;
 };
 
@@ -246,17 +249,17 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
     bool include_bos_eos_tag = ctx.Attr<bool>("include_bos_eos_tag");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto curr_place = ctx.GetPlace();
-    auto* input = ctx.Input<Tensor>("Input");
+    auto* input = ctx.Input<framework::Tensor>("Input");
     auto batch_size = static_cast<int>(input->dims()[0]);
     auto seq_len = static_cast<int>(input->dims()[1]);
     auto n_labels = static_cast<int>(input->dims()[2]);
     phi::funcs::SetConstant<DeviceContext, T> float_functor;
     phi::funcs::SetConstant<DeviceContext, int64_t> int_functor;
-    std::vector<Tensor> historys;
+    std::vector<framework::Tensor> historys;
     // We create tensor buffer in order to avoid allocating memory frequently
     // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero...
     int buffer_size = batch_size * (n_labels + 1) * seq_len + 10 * batch_size;
-    LoDTensor int_buffer;
+    framework::LoDTensor int_buffer;
     int_buffer.Resize(phi::make_ddim({buffer_size}));
     int_buffer.mutable_data<int64_t>(ctx.GetPlace());
     TensorBuffer int_tensor_buffer(int_buffer);
@@ -264,64 +267,78 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
     // 10 means allocate 10*batch_size*n_labels bytes, such as alpha, alpha_max
     buffer_size = batch_size * (seq_len + 10) * n_labels +
                   (batch_size + 2) * n_labels * n_labels;
-    LoDTensor float_buffer;
+    framework::LoDTensor float_buffer;
     float_buffer.Resize(phi::make_ddim({buffer_size}));
     float_buffer.mutable_data<T>(ctx.GetPlace());
     TensorBuffer float_tensor_buffer(float_buffer);
-    auto* length = ctx.Input<Tensor>("Length");
-    Tensor left_length = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+    auto* length = ctx.Input<framework::Tensor>("Length");
+    framework::Tensor left_length =
+        int_tensor_buffer.GetBufferBlock({batch_size, 1});
     framework::TensorCopy(*length, curr_place, dev_ctx, &left_length);
     int64_t max_seq_len = 0;
     GetMaxValue<DeviceContext, int64_t> get_max_value;
     get_max_value(dev_ctx, left_length, &max_seq_len);
 
-    auto* scores = ctx.Output<Tensor>("Scores");
+    auto* scores = ctx.Output<framework::Tensor>("Scores");
     scores->mutable_data<T>(curr_place);
-    auto* path = ctx.Output<Tensor>("Path");
+    auto* path = ctx.Output<framework::Tensor>("Path");
     path->Resize({batch_size, max_seq_len});
     path->mutable_data<int64_t>(curr_place);
-    Tensor tpath = int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
+    framework::Tensor tpath =
+        int_tensor_buffer.GetBufferBlock({max_seq_len, batch_size});
     auto batch_path = Unbind(tpath);
     for (auto it = batch_path.begin(); it != batch_path.end(); ++it) {
       it->Resize({batch_size});
     }
     // create and init required tensor
-    Tensor input_exp =
+    framework::Tensor input_exp =
         float_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
     TransCompute<DeviceContext, T>(3, dev_ctx, *input, &input_exp, {1, 0, 2});
-    auto* transition = ctx.Input<Tensor>("Transition");
-    Tensor trans_exp = float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
+    auto* transition = ctx.Input<framework::Tensor>("Transition");
+    framework::Tensor trans_exp =
+        float_tensor_buffer.GetBufferBlock({n_labels, n_labels});
     framework::TensorCopy(*transition, curr_place, dev_ctx, &trans_exp);
     trans_exp.Resize({1, n_labels, n_labels});
-    Tensor alpha = float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+    framework::Tensor alpha =
+        float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
+    framework::Tensor zero = int_tensor_buffer.GetBufferBlock({batch_size, 1});
     int_functor(dev_ctx, &zero, 0);
-    Tensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
+    framework::Tensor one = int_tensor_buffer.GetBufferBlock({batch_size, 1});
     int_functor(dev_ctx, &one, 1);
-    Tensor float_one = float_tensor_buffer.GetBufferBlock({batch_size, 1});
+    framework::Tensor float_one =
+        float_tensor_buffer.GetBufferBlock({batch_size, 1});
     float_functor(dev_ctx, &float_one, static_cast<T>(1.0));
-    Tensor alpha_trn_sum =
+    framework::Tensor alpha_trn_sum =
         float_tensor_buffer.GetBufferBlock({batch_size, n_labels, n_labels});
-    Tensor alpha_max =
+    framework::Tensor alpha_max =
         float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    Tensor alpha_argmax =
+    framework::Tensor alpha_argmax =
         int_tensor_buffer.GetBufferBlock({seq_len, batch_size, n_labels});
     auto alpha_argmax_unbind = Unbind(alpha_argmax);
-    Tensor alpha_nxt =
+    framework::Tensor alpha_nxt =
         float_tensor_buffer.GetBufferBlock({batch_size, n_labels});
-    Tensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
-    Tensor zero_len_mask = int_tensor_buffer.GetBufferBlock({batch_size});
-    Tensor float_mask = float_tensor_buffer.GetBufferBlock({batch_size, 1});
-    Tensor stop_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
-    Tensor start_trans = float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
-    Tensor rest_trans =
+    framework::Tensor int_mask = int_tensor_buffer.GetBufferBlock({batch_size});
+    framework::Tensor zero_len_mask =
+        int_tensor_buffer.GetBufferBlock({batch_size});
+    framework::Tensor float_mask =
+        float_tensor_buffer.GetBufferBlock({batch_size, 1});
+    framework::Tensor stop_trans =
+        float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+    framework::Tensor start_trans =
+        float_tensor_buffer.GetBufferBlock({1, 1, n_labels});
+    framework::Tensor rest_trans =
         float_tensor_buffer.GetBufferBlock({1, n_labels - 2, n_labels});
-    Tensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
-    Tensor last_ids_tmp = int_tensor_buffer.GetBufferBlock({batch_size});
-    Tensor batch_offset = int_tensor_buffer.GetBufferBlock({batch_size});
-    Tensor gather_idx = int_tensor_buffer.GetBufferBlock({batch_size});
-    std::vector<const Tensor*> shape{&rest_trans, &stop_trans, &start_trans};
-    std::vector<Tensor*> outputs{&rest_trans, &stop_trans, &start_trans};
+    framework::Tensor last_ids = int_tensor_buffer.GetBufferBlock({batch_size});
+    framework::Tensor last_ids_tmp =
+        int_tensor_buffer.GetBufferBlock({batch_size});
+    framework::Tensor batch_offset =
+        int_tensor_buffer.GetBufferBlock({batch_size});
+    framework::Tensor gather_idx =
+        int_tensor_buffer.GetBufferBlock({batch_size});
+    std::vector<const framework::Tensor*> shape{&rest_trans, &stop_trans,
+                                                &start_trans};
+    std::vector<framework::Tensor*> outputs{&rest_trans, &stop_trans,
+                                            &start_trans};
     math::SplitFunctor<DeviceContext, T> split_functor;
     split_functor(dev_ctx, trans_exp, shape, 1, &outputs);
     stop_trans.Resize({1, n_labels});
@@ -346,9 +363,9 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
     SubInt(dev_ctx, left_length, one, &left_length);
     Argmax<DeviceContext, T, int64_t> argmax;
     for (int64_t i = 1; i < max_seq_len; ++i) {
-      Tensor logit = input_exp.Slice(i, i + 1);
+      framework::Tensor logit = input_exp.Slice(i, i + 1);
       logit.Resize({batch_size, n_labels});
-      Tensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
+      framework::Tensor& alpha_exp = alpha.Resize({batch_size, n_labels, 1});
       AddFloat(dev_ctx, alpha_exp, trans_exp, &alpha_trn_sum);
       auto alpha_argmax_temp = alpha_argmax_unbind[i - 1];
       alpha_argmax_temp.Resize({batch_size, n_labels});
@@ -395,7 +412,8 @@ class ViterbiDecodeKernel : public framework::OpKernel<T> {
       ++last_ids_index;
       AddInt(dev_ctx, left_length, one, &left_length);
       AddInt(dev_ctx, batch_offset, last_ids, &gather_idx);
-      Tensor& last_ids_update = batch_path[actual_len - last_ids_index];
+      framework::Tensor& last_ids_update =
+          batch_path[actual_len - last_ids_index];
       hist->Resize({batch_size * n_labels});
       gather(dev_ctx, *hist, gather_idx, &last_ids_update);
       GetMask<DeviceContext, GreaterThanFunctor, int64_t>()(ctx, left_length,
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h
similarity index 62%
rename from paddle/fluid/operators/gather.cu.h
rename to paddle/phi/kernels/funcs/gather.cu.h
index fef425c53a..6e31ab7f8c 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/phi/kernels/funcs/gather.cu.h
@@ -13,24 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
 #include <vector>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/fluid/memory/memcpy.h"
+// TODO(paddle-dev): move gpu_primitives.h to phi
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/utils/dim.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-namespace paddle {
-namespace operators {
 
-using framework::Tensor;
-using platform::DeviceContext;
+namespace phi {
+namespace funcs {
 
 template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
-                                 T* output, size_t index_size,
+__global__ void GatherCUDAKernel(const T* params,
+                                 const IndexT* indices,
+                                 T* output,
+                                 size_t index_size,
                                  size_t slice_size) {
   CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
     int64_t indices_i = i / slice_size;
@@ -42,9 +43,12 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
 }
 
 template <typename T, typename IndexT = int>
-__global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims,
-                                   const IndexT* indices, T* output,
-                                   size_t remain_size, size_t slice_size,
+__global__ void GatherNdCUDAKernel(const T* input,
+                                   const int64_t* input_dims,
+                                   const IndexT* indices,
+                                   T* output,
+                                   size_t remain_size,
+                                   size_t slice_size,
                                    size_t end_size) {
   CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {
     int64_t indices_i = i / slice_size;
@@ -59,7 +63,8 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims,
           "please check whether the dimensions of index and "
           "input meet the requirements. It should "
           "be less than [%d] and greater than or equal to 0, but received [%d]",
-          input_dims[j], index_value);
+          input_dims[j],
+          index_value);
       gather_i += (index_value * temp);
       temp *= input_dims[j];
     }
@@ -76,13 +81,16 @@ __global__ void GatherNdCUDAKernel(const T* input, const int64_t* input_dims,
  * return: output tensor
  */
 template <typename T, typename IndexT = int>
-void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
-               const Tensor& index, Tensor* output) {
+void GPUGather(const phi::GPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
   if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      platform::errors::InvalidArgument(
-                          "If the index's rank of gather_op is 2,"
-                          " the second dimension should be 1."));
+    PADDLE_ENFORCE_EQ(
+        index.dims()[1],
+        1,
+        phi::errors::InvalidArgument("If the index's rank of gather_op is 2,"
+                                     " the second dimension should be 1."));
   }
 
   // index size
@@ -90,7 +98,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   if (index_size == 0) return;
 
   auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
+  phi::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
   // slice size
@@ -105,18 +113,17 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   int64_t n = slice_size * index_size;
   int64_t grid = (n + block - 1) / block;
 
-  GatherCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
       p_src, p_index, p_output, index_size, slice_size);
 }
 
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUGatherNd(const framework::ExecutionContext& context,
-                 const Tensor& input, const Tensor& index, Tensor* output) {
-  const auto& ctx = context.template device_context<DeviceContext>();
+template <typename T, typename IndexT = int>
+void GPUGatherNd(const phi::GPUContext& ctx,
+                 const DenseTensor& input,
+                 const DenseTensor& index,
+                 DenseTensor* output) {
   const auto gplace = ctx.GetPlace();
-  auto cplace = platform::CPUPlace();
+  auto cplace = phi::CPUPlace();
 
   auto index_dims = index.dims();
   auto index_dims_size = index_dims.size();
@@ -143,29 +150,36 @@ void GPUGatherNd(const framework::ExecutionContext& context,
     v_input_dims[i] = input_dims[i];
   }
 
-  auto& dev_ctx = context.cuda_device_context();
+  phi::DenseTensor input_dims_tensor;
+  input_dims_tensor.Resize({input_dims_size});
+  auto* g_input_dims = ctx.Alloc<int64_t>(&input_dims_tensor);
   int64_t bytes = input_dims_size * sizeof(int64_t);
-  auto p_input_dims = memory::Alloc(dev_ctx, bytes);
-  int64_t* g_input_dims = reinterpret_cast<int64_t*>(p_input_dims->ptr());
-  memory::Copy(gplace, g_input_dims, cplace, v_input_dims.data(), bytes,
-               ctx.stream());
+
+  paddle::memory::Copy(
+      gplace, g_input_dims, cplace, v_input_dims.data(), bytes, ctx.stream());
 
   int block = 512;
   int64_t n = slice_size * remain_numel;
   int64_t grid = (n + block - 1) / block;
 
-  GatherNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_input, g_input_dims, p_index, p_output, remain_numel, slice_size,
-      end_size);
+  GatherNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(p_input,
+                                                                  g_input_dims,
+                                                                  p_index,
+                                                                  p_output,
+                                                                  remain_numel,
+                                                                  slice_size,
+                                                                  end_size);
 }
 
 template <typename T, typename U>
-__global__ void GatherGPUKernel(const T* input, const U* index, T* out,
-                                int64_t outer_dim_size, int64_t inner_dim_size,
+__global__ void GatherGPUKernel(const T* input,
+                                const U* index,
+                                T* out,
+                                int64_t outer_dim_size,
+                                int64_t inner_dim_size,
                                 int64_t out_index_dim_size,
-                                int64_t input_index_dim_size, int64_t size) {
+                                int64_t input_index_dim_size,
+                                int64_t size) {
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
   for (; idx < size; idx += blockDim.x * gridDim.x) {
@@ -180,7 +194,8 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out,
         "please check whether the dimensions of index and "
         "input meet the requirements. It should "
         "be less than [%d] and greater than or equal to 0, but received [%d]",
-        input_index_dim_size, index_val);
+        input_index_dim_size,
+        index_val);
 
     int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
     int64_t input_index =
@@ -191,11 +206,14 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out,
 }
 
 template <typename T, typename U>
-__global__ void GatherGradGPUKernel(const T* input, const U* index, T* out,
+__global__ void GatherGradGPUKernel(const T* input,
+                                    const U* index,
+                                    T* out,
                                     int64_t outer_dim_size,
                                     int64_t inner_dim_size,
                                     int64_t input_index_dim_size,
-                                    int64_t out_index_dim_size, int64_t size) {
+                                    int64_t out_index_dim_size,
+                                    int64_t size) {
   int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < size; idx += blockDim.x * gridDim.x) {
     int64_t inner_dim_index = idx / (outer_dim_size * input_index_dim_size);
@@ -210,10 +228,11 @@ __global__ void GatherGradGPUKernel(const T* input, const U* index, T* out,
 }
 
 template <typename T, typename U>
-void GatherV2CUDAFunction(const Tensor* input, const Tensor* index,
-                          const int axis, Tensor* out,
-                          const paddle::platform::Place& place,
-                          const framework::ExecutionContext& ctx) {
+void GatherV2CUDAFunction(const DenseTensor* input,
+                          const DenseTensor* index,
+                          const int axis,
+                          DenseTensor* out,
+                          const phi::GPUContext& ctx) {
   int64_t index_size = index->numel();
   int64_t input_size = input->numel();
   auto input_dim = input->dims();
@@ -241,24 +260,31 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index,
   auto out_dim = phi::make_ddim(out_dim_vec);
 
   out->Resize(out_dim);
-  auto* out_data = out->mutable_data<T>(place);
+  auto* out_data = ctx.Alloc<T>(out);
   int64_t out_size = out->numel();
   if (out_size == 0) return;
 
-  platform::GpuLaunchConfig config =
-      platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_size);
-  auto stream = ctx.cuda_device_context().stream();
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
+  auto stream = ctx.stream();
   GatherGPUKernel<
-      T, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
-      index_size, index_dim_size, out_size);
+      T,
+      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
+      input_data,
+      index_data,
+      out_data,
+      outer_dim_size,
+      inner_dim_size,
+      index_size,
+      index_dim_size,
+      out_size);
 }
 
 template <typename T, typename U>
-void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index,
-                              const int axis, Tensor* out,
-                              const paddle::platform::Place& place,
-                              const framework::ExecutionContext& ctx) {
+void GatherV2GradCUDAFunction(const DenseTensor* input,
+                              const DenseTensor* index,
+                              const int axis,
+                              DenseTensor* out,
+                              const phi::GPUContext& ctx) {
   auto* index_data = index->data<U>();
   int64_t index_size = index->numel();
   int64_t input_size = input->numel();
@@ -279,19 +305,25 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index,
     outer_dim_size *= input_dim[i];
   }
 
-  auto* out_data = out->mutable_data<T>(place);
-  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* out_data = ctx.Alloc<T>(out);
   auto out_dim = out->dims();
   int64_t out_index_dim_size = out_dim[axis_index];
-  phi::funcs::set_constant(*dev_ctx, out, 0.0);
+  phi::funcs::set_constant(ctx, out, 0.0);
 
-  platform::GpuLaunchConfig config =
-      platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_size);
-  auto stream = ctx.cuda_device_context().stream();
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, input_size);
+  auto stream = ctx.stream();
   GatherGradGPUKernel<
-      T, U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-      input_data, index_data, out_data, outer_dim_size, inner_dim_size,
-      input_index_dim_size, out_index_dim_size, input_size);
+      T,
+      U><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
+      input_data,
+      index_data,
+      out_data,
+      outer_dim_size,
+      inner_dim_size,
+      input_index_dim_size,
+      out_index_dim_size,
+      input_size);
 }
-}  // namespace operators
-}  // namespace paddle
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/gather.h b/paddle/phi/kernels/funcs/gather.h
similarity index 72%
rename from paddle/fluid/operators/gather.h
rename to paddle/phi/kernels/funcs/gather.h
index 46f78b16ef..740042c999 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/phi/kernels/funcs/gather.h
@@ -17,16 +17,13 @@ limitations under the License. */
 #include <cstring>
 #include <vector>
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
+namespace phi {
+namespace funcs {
 
 /**
  * A thin wrapper for gathering on cpu tensor
@@ -36,22 +33,23 @@ using framework::Tensor;
  * return: output tensor
  */
 template <typename T, typename IndexT = int>
-void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
-               const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(
-      platform::is_cpu_place(ctx.GetPlace()), true,
-      platform::errors::PreconditionNotMet("It should be running on the CPU."));
+void CPUGather(const phi::CPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
   // check index of shape 1-D
   if (index.dims().size() == 2) {
     PADDLE_ENFORCE_EQ(
-        index.dims()[1], 1,
-        platform::errors::InvalidArgument(
+        index.dims()[1],
+        1,
+        phi::errors::InvalidArgument(
             "index.dims()[1] should be 1 when index.dims().size() = 2"
             "in gather_op, but received value is [%d].",
             index.dims()[1]));
   } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      platform::errors::InvalidArgument(
+    PADDLE_ENFORCE_EQ(index.dims().size(),
+                      1,
+                      phi::errors::InvalidArgument(
                           "index.dims().size() should be 1 or 2 in gather_op,"
                           "but received shape's size is [%d].",
                           index.dims().size()));
@@ -74,29 +72,32 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
 
   for (int64_t i = 0; i < index_size; ++i) {
     IndexT index_ = p_index[i];
-    PADDLE_ENFORCE_LT(p_index[i], input_size,
-                      platform::errors::OutOfRange(
+    PADDLE_ENFORCE_LT(p_index[i],
+                      input_size,
+                      phi::errors::OutOfRange(
                           "The element of Index must be less than the size of "
                           "input dim size of axis which is %d, but received "
                           "index element which is %d in the %d index.",
-                          input_size, p_index[i], i));
-    PADDLE_ENFORCE_GE(p_index[i], 0,
-                      platform::errors::OutOfRange(
+                          input_size,
+                          p_index[i],
+                          i));
+    PADDLE_ENFORCE_GE(p_index[i],
+                      0,
+                      phi::errors::OutOfRange(
                           "The element of Index must be greater than or equal "
                           "to 0, but received index element which is %d in the "
                           "%d index.",
-                          p_index[i], i));
+                          p_index[i],
+                          i));
     memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
   }
 }
 
 template <typename T, typename IndexT = int>
-void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
-                 const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(
-      platform::is_cpu_place(ctx.GetPlace()), true,
-      platform::errors::PreconditionNotMet("It should be running on the CPU."));
-
+void CPUGatherNd(const phi::CPUContext& ctx,
+                 const DenseTensor& input,
+                 const DenseTensor& index,
+                 DenseTensor* output) {
   auto index_dims = index.dims();
   auto index_dims_size = index_dims.size();
   auto input_dims = input.dims();
@@ -124,25 +125,30 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input,
     for (int64_t j = end_size - 1; j >= 0; --j) {
       IndexT index_value = p_index[i * end_size + j];
       PADDLE_ENFORCE_LT(
-          index_value, input_dims[j],
-          platform::errors::InvalidArgument(
+          index_value,
+          input_dims[j],
+          phi::errors::InvalidArgument(
               "Input(index[-1)] has wrong value, it is [%d]", index_value));
       PADDLE_ENFORCE_GE(
-          index_value, 0,
-          platform::errors::InvalidArgument(
+          index_value,
+          0,
+          phi::errors::InvalidArgument(
               "The value of Input(index) must be no less than 0"));
 
       index_ += (index_value * temp);
       temp *= input_dims[j];
     }
-    memcpy(p_output + i * slice_size, p_input + index_ * slice_size,
-           slice_bytes);
+    memcpy(
+        p_output + i * slice_size, p_input + index_ * slice_size, slice_bytes);
   }
 }
 
 template <typename T, typename U>
-void GatherV2Function(const Tensor* input, const Tensor* index, int axis,
-                      Tensor* out, const paddle::platform::Place& place) {
+void GatherV2Function(const phi::CPUContext& ctx,
+                      const DenseTensor* input,
+                      const DenseTensor* index,
+                      int axis,
+                      DenseTensor* out) {
   auto* index_data = index->data<U>();
   int64_t index_size = index->numel();
   int64_t input_size = input->numel();
@@ -154,18 +160,23 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,
 
   int64_t input_index_dim_size = input_dim[axis_index];
   for (int64_t i = 0; i < index_size; i++) {
-    PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size,
-                      platform::errors::OutOfRange(
+    PADDLE_ENFORCE_LT(index_data[i],
+                      input_index_dim_size,
+                      phi::errors::OutOfRange(
                           "The element of Index must be less than the size of "
                           "input dim size of axis which is %d, but received "
                           "index element which is %d in the %d index.",
-                          input_index_dim_size, index_data[i], i));
-    PADDLE_ENFORCE_GE(index_data[i], 0,
-                      platform::errors::OutOfRange(
+                          input_index_dim_size,
+                          index_data[i],
+                          i));
+    PADDLE_ENFORCE_GE(index_data[i],
+                      0,
+                      phi::errors::OutOfRange(
                           "The element of Index must be greater than or equal "
                           "to 0, but received index element which is %d in the "
                           "%d index.",
-                          index_data[i], i));
+                          index_data[i],
+                          i));
   }
 
   int64_t inner_dim_size = 1;
@@ -184,7 +195,7 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,
   auto out_dim = phi::make_ddim(out_dim_vec);
 
   out->Resize(out_dim);
-  auto* out_data = out->mutable_data<T>(place);
+  auto* out_data = ctx.Alloc<T>(out);
 
   int out_index = 0;
   for (int64_t i = 0; i < inner_dim_size; i++) {
@@ -200,9 +211,11 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,
 }
 
 template <typename T, typename U>
-void GatherV2GradFunction(const Tensor* input, const Tensor* index,
-                          const int axis, Tensor* out,
-                          const paddle::platform::Place& place) {
+void GatherV2GradFunction(const phi::CPUContext& ctx,
+                          const DenseTensor* input,
+                          const DenseTensor* index,
+                          const int axis,
+                          DenseTensor* out) {
   auto* index_data = index->data<U>();
 
   auto input_dim = input->dims();
@@ -222,11 +235,10 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index,
     outer_dim_size *= input_dim[i];
   }
 
-  auto* out_data = out->mutable_data<T>(place);
-  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+  auto* out_data = ctx.Alloc<T>(out);
   auto out_dim = out->dims();
   int64_t out_index_dim_size = out_dim[axis_index];
-  phi::funcs::set_constant(*dev_ctx, out, 0.0);
+  phi::funcs::set_constant(ctx, out, 0.0);
 
   for (int64_t i = 0; i < inner_dim_size; i++) {
     for (int64_t j = 0; j < input_index_dim_size; j++) {
@@ -239,5 +251,5 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index,
   }
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
similarity index 67%
rename from paddle/fluid/operators/scatter.cu.h
rename to paddle/phi/kernels/funcs/scatter.cu.h
index 2fea08516d..f87e8c882c 100644
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -15,20 +15,19 @@ limitations under the License. */
 #pragma once
 #include <unordered_set>
 #include <vector>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
+namespace phi {
+namespace funcs {
 
 template <typename T, typename IndexT = int>
-__global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
-                                      size_t index_size, size_t slice_size) {
+__global__ void ScatterInitCUDAKernel(const IndexT* indices,
+                                      T* output,
+                                      size_t index_size,
+                                      size_t slice_size) {
   CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
     int64_t indices_i = i / slice_size;
     int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
@@ -47,9 +46,12 @@ __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
 }
 
 template <typename T, typename IndexT = int>
-__global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
-                                  T* output, size_t index_size,
-                                  size_t slice_size, bool overwrite) {
+__global__ void ScatterCUDAKernel(const T* params,
+                                  const IndexT* indices,
+                                  T* output,
+                                  size_t index_size,
+                                  size_t slice_size,
+                                  bool overwrite) {
   CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
     int64_t indices_i = i / slice_size;
     int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
@@ -72,9 +74,12 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
 }
 
 template <typename T, typename IndexT = int>
-__global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
-                                    T* output, const int64_t* output_dims,
-                                    size_t remain_size, size_t slice_size,
+__global__ void ScatterNdCUDAKernel(const T* update,
+                                    const IndexT* indices,
+                                    T* output,
+                                    const int64_t* output_dims,
+                                    size_t remain_size,
+                                    size_t slice_size,
                                     size_t end_size) {
   CUDA_KERNEL_LOOP_TYPE(i, remain_size * slice_size, int64_t) {
     int64_t indices_i = i / slice_size;
@@ -90,7 +95,8 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
           "please check whether the dimensions of index and "
           "input meet the requirements. It should "
           "be less than [%d] and greater or equal to 0, but received [%d]",
-          output_dims[j], index_value);
+          output_dims[j],
+          index_value);
 
       gather_i += (index_value * temp);
       temp *= output_dims[j];
@@ -109,21 +115,24 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
  * return: output tensor
  */
 template <typename T, typename IndexT = int>
-void GPUScatterAssign(const framework::ExecutionContext& context,
-                      const Tensor& src, const Tensor& index, Tensor* output,
+void GPUScatterAssign(const phi::GPUContext& ctx,
+                      const DenseTensor& src,
+                      const DenseTensor& index,
+                      DenseTensor* output,
                       bool overwrite = true) {
   // check index of shape 1-D
-  const auto& ctx = context.device_context();
   if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      platform::errors::InvalidArgument(
-                          "index.dims()[1] should be 1 when "
-                          "index.dims().size() = 2 in scatter_op."
-                          "But received value is [%d]",
-                          index.dims()[1]));
+    PADDLE_ENFORCE_EQ(
+        index.dims()[1],
+        1,
+        phi::errors::InvalidArgument("index.dims()[1] should be 1 when "
+                                     "index.dims().size() = 2 in scatter_op."
+                                     "But received value is [%d]",
+                                     index.dims()[1]));
   } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      platform::errors::InvalidArgument(
+    PADDLE_ENFORCE_EQ(index.dims().size(),
+                      1,
+                      phi::errors::InvalidArgument(
                           "index.dims().size() should be 1 or 2 in scatter_op."
                           "But received value is [%d]",
                           index.dims().size()));
@@ -131,7 +140,7 @@ void GPUScatterAssign(const framework::ExecutionContext& context,
   int64_t index_size = index.dims()[0];
 
   auto src_dims = src.dims();
-  framework::DDim output_dims(src_dims);
+  phi::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
   // slice size
@@ -150,23 +159,20 @@ void GPUScatterAssign(const framework::ExecutionContext& context,
 
   // if not overwrite mode, init data
   if (!overwrite) {
-    ScatterInitCUDAKernel<T, IndexT><<<
-        grid, block, 0,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+    ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
         p_index, p_output, index_size, slice_size);
   }
 
-  ScatterCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+  ScatterCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
       p_src, p_index, p_output, index_size, slice_size, overwrite);
 }
 
 // The function is only for scatter grad x,
 // however update grad use gather
 template <typename T, typename IndexT = int>
-void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
-                        Tensor* output) {
+void GPUScatterGradForX(const phi::GPUContext& ctx,
+                        const DenseTensor& index,
+                        DenseTensor* output) {
   int64_t index_size = index.dims()[0];
   auto dst_dims = output->dims();
   // slice size
@@ -181,21 +187,18 @@ void GPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
   int64_t n = slice_size * index_size;
   int64_t height = (n + block - 1) / block;
 
-  int64_t max_grid_dimx =
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
-          .GetCUDAMaxGridDimSize()[0];
+  int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
   int64_t grid = height < max_grid_dimx ? height : max_grid_dimx;
 
-  ScatterInitCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+  ScatterInitCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
       p_index, p_output, index_size, slice_size);
 }
 
-template <typename DeviceContext, typename T, typename IndexT = int>
-void GPUScatterNdAdd(const framework::ExecutionContext& context,
-                     const Tensor& update, const Tensor& index,
-                     Tensor* output) {
+template <typename T, typename IndexT = int>
+void GPUScatterNdAdd(const phi::GPUContext& ctx,
+                     const DenseTensor& update,
+                     const DenseTensor& index,
+                     DenseTensor* output) {
   auto index_dims = index.dims();
   auto index_dims_size = index_dims.size();
 
@@ -219,31 +222,34 @@ void GPUScatterNdAdd(const framework::ExecutionContext& context,
   const size_t slice_bytes = slice_size * sizeof(T);
   // put output_dims int CUDA
   // gplace and cplace
-  const auto& ctx = context.template device_context<DeviceContext>();
   const auto gplace = ctx.GetPlace();
-  auto cplace = platform::CPUPlace();
+  auto cplace = phi::CPUPlace();
 
   std::vector<int64_t> v_output_dims(output_dims_size);
   for (int i = 0; i < output_dims_size; ++i) {
     v_output_dims[i] = output_dims[i];
   }
-  auto& dev_ctx = context.cuda_device_context();
+
+  phi::DenseTensor out_dims_tensor;
+  out_dims_tensor.Resize({output_dims_size});
+  auto* g_output_dims = ctx.Alloc<int64_t>(&out_dims_tensor);
   int64_t bytes = output_dims_size * sizeof(int64_t);
-  auto output_dims_ptr = memory::Alloc(dev_ctx, bytes);
-  int64_t* g_output_dims = reinterpret_cast<int64_t*>(output_dims_ptr->ptr());
-  memory::Copy(gplace, g_output_dims, cplace, v_output_dims.data(), bytes,
-               ctx.stream());
+  paddle::memory::Copy(
+      gplace, g_output_dims, cplace, v_output_dims.data(), bytes, ctx.stream());
 
   int block = 512;
   int64_t n = slice_size * remain_numel;
   int64_t grid = (n + block - 1) / block;
 
-  ScatterNdCUDAKernel<T, IndexT><<<
-      grid, block, 0,
-      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_update, p_index, p_output, g_output_dims, remain_numel, slice_size,
+  ScatterNdCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
+      p_update,
+      p_index,
+      p_output,
+      g_output_dims,
+      remain_numel,
+      slice_size,
       end_size);
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/fluid/operators/scatter.h b/paddle/phi/kernels/funcs/scatter.h
similarity index 65%
rename from paddle/fluid/operators/scatter.h
rename to paddle/phi/kernels/funcs/scatter.h
index eae82fcd01..5d15c955a7 100644
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/phi/kernels/funcs/scatter.h
@@ -15,18 +15,16 @@ limitations under the License. */
 #pragma once
 #include <cstring>
 #include <string>
+#include <unordered_set>
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "unordered_set"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
+namespace phi {
+namespace funcs {
 
 /**
   * Return the updated array pointer, use blas or eigen lib to optimize time
@@ -34,24 +32,31 @@ using Tensor = framework::Tensor;
  */
 template <typename T, typename IndexT = int>
 typename std::enable_if<std::is_floating_point<T>::value>::type
-elementwise_inner_add(const framework::ExecutionContext& ctx,
-                      const T* src_pointer, T* dst_pointer, size_t src_index,
-                      IndexT dst_index, size_t slice_size) {
-  auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
-  blas.VADD(slice_size, src_pointer + src_index * slice_size,
+elementwise_inner_add(const phi::CPUContext& ctx,
+                      const T* src_pointer,
+                      T* dst_pointer,
+                      size_t src_index,
+                      IndexT dst_index,
+                      size_t slice_size) {
+  auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
+  blas.VADD(slice_size,
+            src_pointer + src_index * slice_size,
             dst_pointer + dst_index * slice_size,
             dst_pointer + dst_index * slice_size);
 }
 
 template <typename T, typename IndexT = int>
 typename std::enable_if<!std::is_floating_point<T>::value>::type
-elementwise_inner_add(const framework::ExecutionContext& ctx,
-                      const T* src_pointer, T* dst_pointer, size_t src_index,
-                      IndexT dst_index, size_t slice_size) {
-  using EigenVector = typename framework::EigenTensor<T, 1>::Type;
-  using ConstEigenVector = typename framework::EigenTensor<T, 1>::ConstType;
-
-  framework::EigenDim<1>::Type dim;
+elementwise_inner_add(const phi::CPUContext& ctx,
+                      const T* src_pointer,
+                      T* dst_pointer,
+                      size_t src_index,
+                      IndexT dst_index,
+                      size_t slice_size) {
+  using EigenVector = typename phi::EigenTensor<T, 1>::Type;
+  using ConstEigenVector = typename phi::EigenTensor<T, 1>::ConstType;
+
+  phi::EigenDim<1>::Type dim;
   dim[0] = slice_size;
 
   ConstEigenVector eigen_src(src_pointer + src_index * slice_size, dim);
@@ -67,22 +72,23 @@ elementwise_inner_add(const framework::ExecutionContext& ctx,
  * return: output tensor
  */
 template <typename T, typename IndexT = int>
-void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
-                   const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(
-      platform::is_cpu_place(ctx.GetPlace()), true,
-      platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
+void ScatterAssign(const phi::CPUContext& ctx,
+                   const DenseTensor& src,
+                   const DenseTensor& index,
+                   DenseTensor* output) {
   // check index of shape 1-D
   if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(index.dims()[1], 1,
-                      platform::errors::InvalidArgument(
-                          "index.dims()[1] should be 1 when "
-                          "index.dims().size() =2 in scatter_op."
-                          "But received value is [%d]",
-                          index.dims()[1]));
+    PADDLE_ENFORCE_EQ(
+        index.dims()[1],
+        1,
+        phi::errors::InvalidArgument("index.dims()[1] should be 1 when "
+                                     "index.dims().size() =2 in scatter_op."
+                                     "But received value is [%d]",
+                                     index.dims()[1]));
   } else {
-    PADDLE_ENFORCE_EQ(index.dims().size(), 1,
-                      platform::errors::InvalidArgument(
+    PADDLE_ENFORCE_EQ(index.dims().size(),
+                      1,
+                      phi::errors::InvalidArgument(
                           "index.dims().size() should be 1 or 2 in scatter_op."
                           "But received value is [%d]",
                           index.dims().size()));
@@ -99,12 +105,16 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
   // check src shape and dst shape should match
   for (int i = 1; i < src_dims.size(); i++)
     PADDLE_ENFORCE_EQ(
-        src_dims[i], dst_dims[i],
-        platform::errors::InvalidArgument(
+        src_dims[i],
+        dst_dims[i],
+        phi::errors::InvalidArgument(
             "The dimensions of the source tensor and target tensor should"
             " match, but received source tensor's %d-th dimension is %d,"
             "target tensor's %d-th dimension is %d.",
-            i, src_dims[i], i, dst_dims[i]));
+            i,
+            src_dims[i],
+            i,
+            dst_dims[i]));
 
   // slice size
   size_t slice_size = 1;
@@ -115,8 +125,9 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
   for (int64_t i = 0; i < index_size; ++i) {
     IndexT index_ = p_index[i];
 
-    PADDLE_ENFORCE_GE(index_, 0,
-                      platform::errors::OutOfRange(
+    PADDLE_ENFORCE_GE(index_,
+                      0,
+                      phi::errors::OutOfRange(
                           "The index is out of bounds, "
                           "please check whether the dimensions of index and "
                           "input meet the requirements. It should "
@@ -128,20 +139,20 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
 }
 
 template <typename T, typename IndexT = int>
-void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
-                      const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(
-      platform::is_cpu_place(ctx.device_context().GetPlace()), true,
-      platform::errors::PreconditionNotMet("This kernel only runs on CPU."));
+void ScatterAssignAdd(const phi::CPUContext& ctx,
+                      const DenseTensor& src,
+                      const DenseTensor& index,
+                      DenseTensor* output) {
   // check index of shape 1-D
   PADDLE_ENFORCE_EQ(
       index.dims().size() == 1 ||
           (index.dims().size() == 2 && index.dims()[1] == 1),
-      true, platform::errors::InvalidArgument(
-                "index's shape is error, "
-                "expect index'dims shape is 1 or 2 and index.dims[1] is 1"
-                "but got index'dims shape is %d",
-                index.dims().size()));
+      true,
+      phi::errors::InvalidArgument(
+          "index's shape is error, "
+          "expect index'dims shape is 1 or 2 and index.dims[1] is 1"
+          "but got index'dims shape is %d",
+          index.dims().size()));
   int64_t index_size = index.dims()[0];
 
   auto src_dims = src.dims();
@@ -155,12 +166,16 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
   // check src shape and dst shape should match
   for (int i = 1; i < src_dims.size(); i++)
     PADDLE_ENFORCE_EQ(
-        src_dims[i], dst_dims[i],
-        platform::errors::InvalidArgument(
+        src_dims[i],
+        dst_dims[i],
+        phi::errors::InvalidArgument(
             "The dimensions of the source tensor and target tensor should"
             " match, but received source tensor's %d-th dimension is %d,"
             "target tensor's %d-th dimension is %d.",
-            i, src_dims[i], i, dst_dims[i]));
+            i,
+            src_dims[i],
+            i,
+            dst_dims[i]));
 
   // slice size
   size_t slice_size = 1;
@@ -172,36 +187,40 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
   auto max_index = dst_dims[0];
   for (int64_t i = 0; i < index_size; ++i) {
     const IndexT& index_val = p_index[i];
-    PADDLE_ENFORCE_GE(index_val, 0,
-                      platform::errors::OutOfRange(
+    PADDLE_ENFORCE_GE(index_val,
+                      0,
+                      phi::errors::OutOfRange(
                           "The index is out of bounds, "
                           "please check whether the dimensions of index and "
                           "input meet the requirements. It should "
                           "be greater than or equal to 0, but received [%d]",
                           index_val));
-    PADDLE_ENFORCE_LT(index_val, max_index,
-                      platform::errors::OutOfRange(
+    PADDLE_ENFORCE_LT(index_val,
+                      max_index,
+                      phi::errors::OutOfRange(
                           "The index is out of bounds, "
                           "please check whether the dimensions of index and "
                           "input meet the requirements. It should "
                           "be less than %d, but received %d",
-                          max_index, index_val));
+                          max_index,
+                          index_val));
     memset(p_output + slice_size * index_val, 0, slice_bytes);
   }
 
   // if not in overwrite mode, need to init output data
   for (int64_t i = 0; i < index_size; ++i) {
     const IndexT& index_val = p_index[i];
-    elementwise_inner_add<T, IndexT>(ctx, p_src, p_output, i, index_val,
-                                     slice_size);
+    elementwise_inner_add<T, IndexT>(
+        ctx, p_src, p_output, i, index_val, slice_size);
   }
 }
 
 // The function is only for scatter grad x,
 // however update grad use gather
 template <typename T, typename IndexT = int>
-void CPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
-                        Tensor* output) {
+void CPUScatterGradForX(const phi::CPUContext& ctx,
+                        const DenseTensor& index,
+                        DenseTensor* output) {
   int64_t index_size = index.dims()[0];
   auto dst_dims = output->dims();
   const IndexT* p_index = index.data<IndexT>();
@@ -216,12 +235,10 @@ void CPUScatterGradForX(const platform::DeviceContext& ctx, const Tensor& index,
 }
 
 template <typename T, typename IndexT = int>
-void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update,
-                  const Tensor& index, Tensor* output) {
-  PADDLE_ENFORCE_EQ(
-      platform::is_cpu_place(ctx.device_context().GetPlace()), true,
-      platform::errors::PreconditionNotMet("It should be running on the CPU"));
-
+void ScatterNdAdd(const phi::CPUContext& ctx,
+                  const DenseTensor& update,
+                  const DenseTensor& index,
+                  DenseTensor* output) {
   // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:]
   auto index_dims = index.dims();
   auto index_dims_size = index_dims.size();
@@ -250,21 +267,23 @@ void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update,
     for (int64_t j = end_size - 1; j >= 0; --j) {
       IndexT index_value = p_index[i * end_size + j];
       PADDLE_ENFORCE_EQ(
-          (index_value >= 0 && index_value < output_dims[j]), true,
-          platform::errors::OutOfRange(
+          (index_value >= 0 && index_value < output_dims[j]),
+          true,
+          phi::errors::OutOfRange(
               "The index is out of bounds, "
               "please check whether the dimensions of index and "
               "input meet the requirements. It should "
               "be less than [%d] and greater or equal to 0, but received [%d]",
-              output_dims[j], index_value));
+              output_dims[j],
+              index_value));
 
       index_val += (index_value * temp);
       temp *= output_dims[j];
     }
-    elementwise_inner_add<T, IndexT>(ctx, p_update, p_output, i, index_val,
-                                     slice_size);
+    elementwise_inner_add<T, IndexT>(
+        ctx, p_update, p_output, i, index_val, slice_size);
   }
 }
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
-- 
GitLab