diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index b6459d9b7069567461dbc5f40f45645039a9fd30..ac1ed668f7bf5abbd3f0a9724a2921bb8a96bb41 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -52,37 +52,37 @@ class KernelContext {
   }
 
   void EmplaceBackInput(std::shared_ptr<TensorBase> input) {
+    int index = inputs_.size();
     inputs_.emplace_back(std::move(input));
     // Record the start and end index of the input
-    int index = inputs_.size();
     input_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
   void EmplaceBackInputs(
-      paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
+      const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs) {
+    int index = inputs_.size();
     for (auto in : inputs) {
-      inputs_.emplace_back(in);
+      inputs_.emplace_back(std::move(in));
     }
     // Record the start and end index of the input
-    int index = inputs_.size();
     input_range_.emplace_back(
         std::pair<int, int>(index, index + inputs.size()));
   }
 
   void EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
+    int index = outputs_.size();
     outputs_.emplace_back(std::move(output));
     // Record the start and end index of the input
-    int index = outputs_.size();
     output_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
   void EmplaceBackOutputs(
-      paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
+      const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs) {
+    int index = outputs_.size();
     for (auto out : outputs) {
-      outputs_.emplace_back(out);
+      outputs_.emplace_back(std::move(out));
     }
     // Record the start and end index of the input
-    int index = outputs_.size();
     output_range_.emplace_back(
         std::pair<int, int>(index, index + outputs.size()));
   }
@@ -96,11 +96,40 @@ class KernelContext {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
   }
 
+  template <typename TensorType>
+  std::vector<TensorType> InputBetween(size_t start, size_t end) const {
+    std::vector<TensorType> v;
+    for (size_t i = start; i < end; ++i) {
+      auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
+      v.emplace_back(std::move(*t.get()));
+    }
+
+    return v;
+  }
+
+  const std::pair<int, int>& InputRangeAt(size_t idx) const {
+    return input_range_.at(idx);
+  }
+
+  const std::pair<int, int>& OutputRangeAt(size_t idx) const {
+    return output_range_.at(idx);
+  }
+
   template <typename TensorType>
   TensorType* MutableOutputAt(size_t idx) {
     return static_cast<TensorType*>(outputs_.at(idx).get());
   }
 
+  template <typename TensorType>
+  std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) {
+    std::vector<TensorType*> v;
+    for (size_t i = start; i < end; ++i) {
+      v.emplace_back(static_cast<TensorType*>(outputs_.at(i).get()));
+    }
+
+    return v;
+  }
+
   template <typename AttrType>
   AttrType AttrAt(size_t idx) const {
     try {
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index d3422d173a3db257b05b7b101786f7c5394dd7f0..c2b97148aa5fb1941d5bac0a9e366c70bb6f1149 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -62,9 +62,17 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
       } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
         args_def->AppendInput(
             default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type ==
+                 std::type_index(typeid(const std::vector<DenseTensor>&))) {
+        args_def->AppendInput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
       } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
         args_def->AppendOutput(
             default_key.backend(), default_tensor_layout, default_key.dtype());
+      } else if (arg_type ==
+                 std::type_index(typeid(std::vector<DenseTensor*>))) {
+        args_def->AppendOutput(
+            default_key.backend(), default_tensor_layout, default_key.dtype());
       } else {
         // Attribute deal with
         // TODO(chenweihang): now here allow any types of attribute, maybe
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index c45a81206323e96cab2d04e2df5f639681a0ab96..450202607648dbe8dd59846c3a9abc40ff38ce03 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -79,7 +79,30 @@ using XPUContext = paddle::platform::XPUDeviceContext;
                     "Kernel's Input should appear before Attributes.");    \
       static_assert(out_idx == 0,                                          \
                     "Kernel's Input should appear before Outputs.");       \
-      const tensor_type& arg = ctx->InputAt<tensor_type>(in_idx);          \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);         \
+      const tensor_type& arg = ctx->InputAt<tensor_type>(range.first);     \
+      KernelCallHelper<Tail...>::                                          \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(    \
+              ctx, pargs..., arg);                                         \
+    }                                                                      \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type)        \
+  template <typename... Tail>                                              \
+  struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> {      \
+    template <int dev_ctx_idx,                                             \
+              int in_idx,                                                  \
+              int attr_idx,                                                \
+              int out_idx,                                                 \
+              typename... PreviousArgs>                                    \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
+      static_assert(attr_idx == 0,                                         \
+                    "Kernel's Input should appear before Attributes.");    \
+      static_assert(out_idx == 0,                                          \
+                    "Kernel's Input should appear before Outputs.");       \
+      const std::pair<int, int> range = ctx->InputRangeAt(in_idx);         \
+      std::vector<tensor_type> arg = std::move(                            \
+          ctx->InputBetween<tensor_type>(range.first, range.second));      \
       KernelCallHelper<Tail...>::                                          \
           template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(    \
               ctx, pargs..., arg);                                         \
@@ -104,20 +127,39 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                      \
   }
 
-#define PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)           \
-  template <typename... Tail>                                            \
-  struct KernelCallHelper<tensor_type*, Tail...> {                       \
-    template <int dev_ctx_idx,                                           \
-              int in_idx,                                                \
-              int attr_idx,                                              \
-              int out_idx,                                               \
-              typename... PreviousArgs>                                  \
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {    \
-      tensor_type* arg = ctx->MutableOutputAt<tensor_type>(out_idx);     \
-      KernelCallHelper<Tail...>::                                        \
-          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(  \
-              ctx, pargs..., arg);                                       \
-    }                                                                    \
+#define PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)             \
+  template <typename... Tail>                                              \
+  struct KernelCallHelper<tensor_type*, Tail...> {                         \
+    template <int dev_ctx_idx,                                             \
+              int in_idx,                                                  \
+              int attr_idx,                                                \
+              int out_idx,                                                 \
+              typename... PreviousArgs>                                    \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
+      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);       \
+      tensor_type* arg = ctx->MutableOutputAt<tensor_type>(range.first);   \
+      KernelCallHelper<Tail...>::                                          \
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(    \
+              ctx, pargs..., arg);                                         \
+    }                                                                      \
+  }
+
+#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type)       \
+  template <typename... Tail>                                              \
+  struct KernelCallHelper<std::vector<tensor_type*>, Tail...> {            \
+    template <int dev_ctx_idx,                                             \
+              int in_idx,                                                  \
+              int attr_idx,                                                \
+              int out_idx,                                                 \
+              typename... PreviousArgs>                                    \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {      \
+      const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);       \
+      std::vector<tensor_type*> arg = std::move(                           \
+          ctx->MutableOutputBetween<tensor_type>(range.first, range.second)); \
+      KernelCallHelper<Tail...>::                                          \
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(    \
+              ctx, pargs..., arg);                                         \
+    }                                                                      \
   }
 
 template <typename T>
@@ -152,6 +194,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   /* Input Helpers */
 
   PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor);
   // TODO(chenweihang): adapt SelectedRows
   // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
 
@@ -168,6 +211,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   /* Output Helpers */
 
   PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor);
   // TODO(chenweihang): adapt SelectedRows
   // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
 
diff --git a/paddle/pten/hapi/lib/kernel_dispatch.h b/paddle/pten/hapi/lib/kernel_dispatch.h
index d7190076bf3f68bd2bc3bd7ab4b9ec3b4762b934..f61f3297d6d6c872b244a27319c6de7b16cabff4 100644
--- a/paddle/pten/hapi/lib/kernel_dispatch.h
+++ b/paddle/pten/hapi/lib/kernel_dispatch.h
@@ -122,6 +122,14 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
     key_set.dtype = x.type();
   }
 
+  void operator()(const std::vector<Tensor>& x) {
+    key_set.backend_set =
+        key_set.backend_set | detail::GetTensorBackendSet(x[0]);
+    // TODO(chenweihang): selecte multi layout and dtype
+    key_set.layout = x[0].layout();
+    key_set.dtype = x[0].type();
+  }
+
   // skip other type args, these args don't used in kernel selection
   template <typename T>
   void operator()(const T& x) {
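
Note: the core idea of this diff is that every kernel argument now records a (start, end) interval over the flat tensor list held by KernelContext, so one argument can map to several tensors (a std::vector<DenseTensor> input or std::vector<DenseTensor*> output) without changing how single tensors are stored. The following is a minimal, self-contained C++ sketch of that bookkeeping only; it is not Paddle code, and MiniKernelContext, Tensor, and the method names merely mirror the API above for illustration.

// Standalone illustration of the per-argument (start, end) range bookkeeping.
// All names here are hypothetical stand-ins, not the pten API.
#include <cassert>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Tensor {  // stand-in for pten::DenseTensor
  std::string name;
};

class MiniKernelContext {
 public:
  // A single-tensor argument occupies the range [index, index + 1).
  void EmplaceBackInput(std::shared_ptr<Tensor> input) {
    int index = static_cast<int>(inputs_.size());
    inputs_.emplace_back(std::move(input));
    input_range_.emplace_back(index, index + 1);
  }

  // A multi-tensor argument occupies [index, index + inputs.size()).
  void EmplaceBackInputs(const std::vector<std::shared_ptr<Tensor>>& inputs) {
    int index = static_cast<int>(inputs_.size());
    for (auto& in : inputs) {
      inputs_.emplace_back(in);
    }
    input_range_.emplace_back(index, index + static_cast<int>(inputs.size()));
  }

  const std::pair<int, int>& InputRangeAt(size_t arg_idx) const {
    return input_range_.at(arg_idx);
  }

  // Materialize the tensors of one argument as a vector, like InputBetween.
  std::vector<Tensor> InputBetween(int start, int end) const {
    std::vector<Tensor> v;
    for (int i = start; i < end; ++i) {
      v.push_back(*inputs_.at(i));
    }
    return v;
  }

 private:
  std::vector<std::shared_ptr<Tensor>> inputs_;
  std::vector<std::pair<int, int>> input_range_;  // one entry per argument
};

int main() {
  MiniKernelContext ctx;
  ctx.EmplaceBackInput(std::make_shared<Tensor>(Tensor{"x"}));    // argument 0
  ctx.EmplaceBackInputs({std::make_shared<Tensor>(Tensor{"y0"}),  // argument 1
                         std::make_shared<Tensor>(Tensor{"y1"}),
                         std::make_shared<Tensor>(Tensor{"y2"})});

  auto range = ctx.InputRangeAt(1);  // the three-tensor argument covers [1, 4)
  assert(range.first == 1 && range.second == 4);
  for (const auto& t : ctx.InputBetween(range.first, range.second)) {
    std::cout << t.name << "\n";  // prints y0, y1, y2
  }
  return 0;
}

In this sketch argument 0 covers [0, 1) and the vector argument covers [1, 4), which is the same contract InputRangeAt/InputBetween (and OutputRangeAt/MutableOutputBetween) provide so the KernelCallHelper specializations can reassemble a std::vector argument while the old single-tensor path keeps using range.first.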