diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
index 237cb3bd3d7d1757c4502e0418c6b83957ee8bf8..5127c5934cb48dcbca484ec61917b1e8092202f7 100644
--- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
+++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
@@ -131,6 +131,21 @@ void Activation::operator()() {
   // Add links for activation op.
   activation_op->LinksFrom({activation_input}).LinksTo({activation_out});
 }
+
+void FusedTokenPrune::operator()() {
+  // Create nodes for fused_token_prune.
+  auto* fused_token_prune_input =
+      pattern->NewNode(fused_token_prune_input_repr())
+          ->assert_is_op_input("fused_token_prune", "X");
+  auto* fused_token_prune_op = pattern->NewNode(fused_token_prune_op_repr())
+                                   ->assert_is_op("fused_token_prune");
+  auto* fused_token_prune_output =
+      pattern->NewNode(fused_token_prune_output_repr())
+          ->assert_is_op_output("fused_token_prune", "SlimmedX");
+
+  fused_token_prune_op->LinksFrom({fused_token_prune_input})
+      .LinksTo({fused_token_prune_output});
+}
 }  // namespace patterns

 void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
@@ -563,6 +578,48 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
   };
   gpd6(graph, handler6);

+  GraphPatternDetector gpd7;
+  patterns::FusedTokenPrune fused_token_prune(
+      gpd7.mutable_pattern(), "remove_padding_recover_padding_pass");
+  fused_token_prune();
+
+  auto handler7 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* graph) {
+    VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
+               "fused_token_prune";
+
+    GET_IR_NODE_FROM_SUBGRAPH(
+        fused_token_prune_input, fused_token_prune_input, fused_token_prune);
+    GET_IR_NODE_FROM_SUBGRAPH(
+        fused_token_prune_op, fused_token_prune_op, fused_token_prune);
+    GET_IR_NODE_FROM_SUBGRAPH(
+        fused_token_prune_output, fused_token_prune_output, fused_token_prune);
+
+    std::vector<int64_t> fused_token_prune_input_shape =
+        fused_token_prune_input->Var()->GetShape();
+    check_flag = true;
+    if (fused_token_prune_input_shape.size() !=
+        multihead_matmul_input_shape.size()) {
+      check_flag = false;
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    for (size_t i = 0; i < fused_token_prune_input_shape.size(); ++i) {
+      if (fused_token_prune_input_shape[i] != multihead_matmul_input_shape[i]) {
+        check_flag = false;
+      }
+    }
+    if (!check_flag) {
+      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+                 "remove_padding pass.";
+      return;
+    }
+    insert_recover_padding_op(fused_token_prune_op, fused_token_prune_output);
+    found_subgraph_count++;
+  };
+  gpd7(graph, handler7);
+
   AddStatis(found_subgraph_count);
 }
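Review note: handler7 repeats the size check plus element-by-element loop used
by the earlier handlers. The same condition can be collapsed with std::equal;
a minimal sketch, assuming both shapes are std::vector<int64_t> as returned by
Var()->GetShape() (suggestion only, not part of the patch):

    // Hypothetical condensed form of the shape check in handler7.
    bool shapes_match =
        fused_token_prune_input_shape.size() ==
            multihead_matmul_input_shape.size() &&
        std::equal(fused_token_prune_input_shape.begin(),
                   fused_token_prune_input_shape.end(),
                   multihead_matmul_input_shape.begin());
    if (!shapes_match) {
      VLOG(3) << "Transformer model remove_padding shape check failed, return "
                 "remove_padding pass.";
      return;
    }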
diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
index f93ee4bc7c4ff872c655fdf249c82878225c13d8..ff04dc55323ab4970a51d3b2e1ca834230cacfdc 100644
--- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
+++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h
@@ -95,7 +95,6 @@ struct Fc : public PatternBase {

   PATTERN_DECL_NODE(fc_input);
   PATTERN_DECL_NODE(fc_op);
-  PATTERN_DECL_NODE(fc_out);
 };

 struct Activation : public PatternBase {
@@ -108,6 +107,17 @@ struct Activation : public PatternBase {
   PATTERN_DECL_NODE(activation_op);
   PATTERN_DECL_NODE(activation_out);
 };
+
+struct FusedTokenPrune : public PatternBase {
+  FusedTokenPrune(PDPattern *pattern, const std::string &name_scope)
+      : PatternBase(pattern, name_scope, "fused_token_prune") {}
+
+  void operator()();
+
+  PATTERN_DECL_NODE(fused_token_prune_input);
+  PATTERN_DECL_NODE(fused_token_prune_op);
+  PATTERN_DECL_NODE(fused_token_prune_output);
+};
 }  // namespace patterns

 class RemovePaddingRecoverPaddingPass : public FusePassBase {
diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
index dba0d003c0fe554a5176a3ec26059b21009a2a4b..4832b1fad1826c035885a0be95993ebaa5227d66 100644
--- a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
@@ -52,8 +52,21 @@ class FusedTokenPruneOpConverter : public OpConverter {
       auto* word_id = engine_->GetITensor("word_id");
       auto* pos_id = engine_->GetITensor("pos_id");
       auto* mask_id = engine_->GetITensor("mask_id");
+
+      // reduce_sum: (-1, head_size, token_length, token_length) ->
+      // (-1, token_length)
+      uint32_t reduce_dim = 0;
+      reduce_dim |= 1 << 1;  // 00000000000000000000000000000010
+      reduce_dim |= 1 << 2;  // 00000000000000000000000000000110
+      bool keep_dim = false;
+      nvinfer1::ReduceOperation reduce_type = nvinfer1::ReduceOperation::kSUM;
+      auto* reduce_sum_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Reduce, *Attn, reduce_type, reduce_dim, keep_dim);
+      // reduce_sum_layer->getOutput(0)->setType(reduce_sum_layer->getInput(0)->getType());
+
+      auto* Reduced = reduce_sum_layer->getOutput(0);
       std::vector<nvinfer1::ITensor*> itensors = {
-          Attn, X, Mask, NewMask, word_id, pos_id, mask_id};
+          Reduced, X, Mask, NewMask, word_id, pos_id, mask_id};
       layer = engine_->AddDynamicPlugin(itensors.data(), 7, plugin);
       layer->getOutput(0)->setName(output_name.c_str());
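Review note: TensorRT's IReduceLayer takes the axes to reduce as a uint32_t
bitmask, one bit per tensor dimension, which is what the two |= lines build.
Bits 1 and 2 select the head and row axes of the
(-1, head_size, token_length, token_length) attention tensor, so the kSUM
reduction leaves (-1, token_length). A tiny helper would make the intent
explicit (hypothetical sketch, not part of the patch):

    #include <cstdint>
    #include <initializer_list>

    // Build a TensorRT reduce-axes bitmask from dimension indices.
    inline uint32_t ReduceAxes(std::initializer_list<int> dims) {
      uint32_t mask = 0;
      for (int d : dims) mask |= 1u << d;
      return mask;
    }

    // ReduceAxes({1, 2}) == 0b110, the value assembled in the converter.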
diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
index fe011422c19e95ca47404bf7054a1eb02c9e7b2c..b0c800d31bf3a05db9cdd46aee9f3150626476f4 100644
--- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
@@ -31,19 +31,15 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

-#if IS_TRT_VERSION_GE(6000)
-
 template <typename T>
 __global__ void ElementwiseMask(const T* a,
                                 const T* b,
                                 T* res,
                                 int num_elements) {
-#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid >= num_elements) return;
   const T zero = 0;
   res[tid] = b[tid] >= zero ? a[tid] : zero;
-#endif
 }

 template <typename T>
@@ -123,7 +119,6 @@ __global__ void ReduceSum2(
 template <>
 __global__ void ReduceSum2<half>(
     const half* src, half* dst, int bsz, int nb_head, int max_seq_len) {
-#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
   int tid = threadIdx.x;
   int bid = blockIdx.x;
   int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len);
@@ -155,7 +150,6 @@ __global__ void ReduceSum2(
         static_cast<int>(bsz * max_seq_len),
         static_cast<platform::float16>(res_half[0]));
   }
-#endif
 }

 template <typename T>
@@ -177,14 +171,81 @@ __global__ void TakeAlongAxis(const T* src,
   }
 }

-__global__ void pos_id_prune_kernel(const int32_t* src,
-                                    int32_t* dst,
-                                    int pos_nums,
-                                    float scale) {
-  dst[0] = 0;
-  for (int i = 1; i < pos_nums; i++) {
-    dst[i] =
-        dst[i - 1] + max(static_cast<int>((src[i] - src[i - 1]) * scale), 2);
+__global__ void compute_token_length(const int32_t* src,
+                                     int32_t* dst,
+                                     float scale) {
+  int32_t it = threadIdx.x;
+  dst[it] = max(static_cast<int>((src[it + 1] - src[it]) * scale), 1);
+}
+
+__global__ void fill_index_padding_score(int32_t* token_index,
+                                         const half* scores,
+                                         int32_t scores_size,
+                                         half* padding_scores) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  token_index[tid] = threadIdx.x;
+  if (tid < scores_size) {
+    padding_scores[tid] = scores[tid];
+  } else {
+    padding_scores[tid] = 0;
+  }
+}
+
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+__global__ void general_topk_pair_sort(T* in_keys, int32_t* in_out_values) {
+  typedef cub::BlockRadixSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, int>
+      BlockRadixSort;
+  typedef cub::
+      BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
+          BlockLoadKey;
+  typedef cub::
+      BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_TRANSPOSE>
+          BlockLoadValue;
+  typedef cub::
+      BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_STORE_TRANSPOSE>
+          BlockStoreKey;
+  typedef cub::BlockStore<int,
+                          BLOCK_THREADS,
+                          ITEMS_PER_THREAD,
+                          cub::BLOCK_STORE_TRANSPOSE>
+      BlockStoreValue;
+
+  __shared__ union {
+    typename BlockRadixSort::TempStorage sort;
+    typename BlockLoadKey::TempStorage loadkey;
+    typename BlockLoadValue::TempStorage loadvalue;
+    typename BlockStoreKey::TempStorage storekey;
+    typename BlockStoreValue::TempStorage storevalue;
+  } temp_storage;
+
+  int block_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
+
+  T thread_keys[ITEMS_PER_THREAD];
+  int thread_values[ITEMS_PER_THREAD];
+  BlockLoadKey(temp_storage.loadkey).Load(in_keys + block_offset, thread_keys);
+  BlockLoadValue(temp_storage.loadvalue)
+      .Load(in_out_values + block_offset, thread_values);
+  __syncthreads();
+
+  BlockRadixSort(temp_storage.sort).SortDescending(thread_keys, thread_values);
+  __syncthreads();
+
+  BlockStoreValue(temp_storage.storevalue)
+      .Store(in_out_values + block_offset, thread_values);
+}
+
+__global__ void varlen_prune_token(const half* tokens,
+                                   const int32_t* token_pos,
+                                   const int32_t* token_index,
+                                   half* output) {
+  int batch = blockIdx.x;
+  int token_it = batch * gridDim.y + blockIdx.y;
+  int pre_value_it =
+      token_it * gridDim.z * blockDim.x + blockIdx.z * blockDim.x + threadIdx.x;
+
+  if (token_index[token_it] < token_pos[batch + 1] - token_pos[batch]) {
+    output[(token_index[token_it] + token_pos[batch]) * gridDim.z * blockDim.x +
+           blockIdx.z * blockDim.x + threadIdx.x] = tokens[pre_value_it];
   }
 }
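Review note: general_topk_pair_sort never selects k elements explicitly; it
sorts each batch's (score, index) pairs by score, descending, so that the
first pruned_seq_len entries of token_index_ are the kept tokens. A host-side
reference with the same semantics (hypothetical, for illustration only):

    #include <algorithm>
    #include <numeric>
    #include <vector>

    // Reference: order one batch's token indices by score, descending.
    std::vector<int32_t> TopKOrder(const std::vector<float>& scores) {
      std::vector<int32_t> index(scores.size());
      std::iota(index.begin(), index.end(), 0);
      std::stable_sort(index.begin(), index.end(), [&](int32_t a, int32_t b) {
        return scores[a] > scores[b];
      });
      return index;  // the first k entries are the k highest-scoring tokens
    }

The CUB block sort needs a compile-time item count per block, which is why
enqueue below pads every batch's scores out to 128/256/384 before sorting.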
@@ -195,9 +256,29 @@ nvinfer1::DimsExprs FusedTokenPrunePluginDynamic::getOutputDimensions(
     nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT {
   auto x_dims = inputs[1], new_mask_dims = inputs[3];
   if (flag_varseqlen_) {
+    // max sum of seqlen: ceil(sum / scale) + n - 1 >= for(i=0;i
[The remaining added lines of this hunk, and the header and opening lines of
the next hunk (which deletes the pos_id_prune host helper), were lost in
extraction; only the helper's final two deleted lines survive.]
-  pos_id_prune_kernel<<<...>>>(input, output, pos_nums, scale);
-}

 int FusedTokenPrunePluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc* input_desc,
     const nvinfer1::PluginTensorDesc* output_desc,
     const void* const* inputs,
     void* const* outputs,
     void* workspace,
     cudaStream_t stream) TRT_NOEXCEPT {
-  auto input_type = input_desc[0].type;
-  auto attn_dims = input_desc[0].dims;
-  auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1],
-       max_seq_len = attn_dims.d[2];
-  int device_id;
-  cudaGetDevice(&device_id);
-
-  if (input_type == nvinfer1::DataType::kFLOAT) {
-    VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp32";
-
-    float max = std::numeric_limits<float>::max();
-
-    enqueueImpl<float>(input_desc,
-                       output_desc,
-                       inputs,
-                       outputs,
-                       workspace,
-                       stream,
-                       device_id,
-                       max,
-                       keep_first_token_,
-                       keep_order_);
-
-  } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef TRT_PLUGIN_FP16_AVALIABLE
-    VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp16";
-
-    half max = 65504.0;
-
-    enqueueImpl<half>(input_desc,
-                      output_desc,
-                      inputs,
-                      outputs,
-                      workspace,
-                      stream,
-                      device_id,
-                      max,
-                      keep_first_token_,
-                      keep_order_);
-
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "The Ernie(Bert) TensorRT Plugin should be "
-        "complied with CUDA version >= 10.0 when running with fp16. "
-        "Please recomplie it or try to use fp32 by set "
-        "config.SetTRTDynamicShapeInfo(min_input_shape, "
-        "max_input_shape, opt_input_shape, true"));
-#endif
-  } else {
-    PADDLE_THROW(
-        platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type "
-                                "should be float or half."));
-  }
   if (flag_varseqlen_) {
+    if (!(input_desc[0].type == nvinfer1::DataType::kHALF &&
+          input_desc[1].type == nvinfer1::DataType::kHALF)) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The FusedTokenPrune TRT Plugin's varseqlen inputs must be half."));
+    }
     float scale =
         static_cast<float>(input_desc[3].dims.d[2]) / input_desc[6].dims.d[1];
-    // outputs[2]=inputs[4]; // word_id
-    const int32_t* inputs5 = static_cast<const int32_t*>(inputs[5]);
-    int32_t* outputs3 = static_cast<int32_t*>(outputs[3]);
-    pos_id_prune(
-        inputs5, outputs3, input_desc[5].dims.d[0], scale, stream);  // pos_id
-    // outputs[4]=inputs[6]; // new_mask
+    const int32_t* inputs5 =
+        static_cast<const int32_t*>(inputs[5]);  // pre pos id
+    int32_t* outputs3 = static_cast<int32_t*>(outputs[3]);  // new pos id
+    half* outputs0 = static_cast<half*>(outputs[0]);
+
+    const int32_t B = input_desc[1].dims.d[0];  // batches
+    const int32_t max_sequnce_length =
+        input_desc[1].dims.d[1];  // max sequence length
+    const int32_t length = input_desc[1].dims.d[2];  // vector length
+    const half* scores = static_cast<const half*>(inputs[0]);  // reduce sum
+    const half* tokens = static_cast<const half*>(inputs[1]);
+    const int32_t scores_size = B * max_sequnce_length;
+    int32_t padding_token_length;
+    if (max_sequnce_length <= 128) {
+      padding_token_length = 128;
+    } else if (max_sequnce_length <= 256) {
+      padding_token_length = 256;
+    } else if (max_sequnce_length <= 384) {
+      padding_token_length = 384;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The FusedTokenPrune TRT Plugin only supports token lengths "
+          "up to 384."));
+    }
+
+    // Compute the per-batch token length after pruning.
+    compute_token_length<<<1, B, 0, stream>>>(
+        inputs5, pruned_token_lengths_, scale);
+
+    fill_index_padding_score<<<B, padding_token_length, 0, stream>>>(
+        token_index_, scores, scores_size, padding_scores_);
+
+    // Determine temporary device storage requirements
+    void* d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  pruned_token_lengths_,
+                                  outputs3,
+                                  B + 1);
+    // Allocate temporary storage
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+
+    // Run exclusive prefix sum
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  pruned_token_lengths_,
+                                  outputs3,
+                                  B + 1);
+
+    if (padding_token_length == 128) {
+      general_topk_pair_sort<half, 32, 4><<<B, 32, 0, stream>>>(
+          padding_scores_, token_index_);  // 128
+    } else if (padding_token_length == 256) {
+      general_topk_pair_sort<half, 64, 4><<<B, 64, 0, stream>>>(
+          padding_scores_, token_index_);  // 256
+    } else {
+      general_topk_pair_sort<half, 96, 4><<<B, 96, 0, stream>>>(
+          padding_scores_, token_index_);  // 384
+    }
+
+    int32_t num_threads;
+    if (length < 1024) {
+      num_threads = length;
+    } else {
+      if (length % 512 == 0) {
+        num_threads = 512;
+      } else if (length % 256 == 0) {
+        num_threads = 256;
+      } else if (length % 128 == 0) {
+        num_threads = 128;
+      } else if (length % 64 == 0) {
+        num_threads = 64;
+      } else if (length % 32 == 0) {
+        num_threads = 32;
+      } else if (length % 16 == 0) {
+        num_threads = 16;
+      } else if (length % 8 == 0) {
+        num_threads = 8;
+      } else if (length % 4 == 0) {
+        num_threads = 4;
+      } else if (length % 2 == 0) {
+        num_threads = 2;
+      } else {
+        num_threads = 1;
+      }
+    }
+    const dim3 num_blocks(
+        B,
+        max_sequnce_length,
+        length / num_threads);  // batches, max sequence length,
+                                // vector_length / num_threads
+    varlen_prune_token<<<num_blocks, num_threads, 0, stream>>>(
+        tokens, outputs3, token_index_, outputs0);
+  } else {
+    auto input_type = input_desc[0].type;
+    auto attn_dims = input_desc[0].dims;
+    auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1],
+         max_seq_len = attn_dims.d[2];
+    int device_id;
+    cudaGetDevice(&device_id);
+
+    if (input_type == nvinfer1::DataType::kFLOAT) {
+      VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp32";
+
+      float max = std::numeric_limits<float>::max();
+
+      enqueueImpl<float>(input_desc,
+                         output_desc,
+                         inputs,
+                         outputs,
+                         workspace,
+                         stream,
+                         device_id,
+                         max,
+                         keep_first_token_,
+                         keep_order_);
+
+    } else if (input_type == nvinfer1::DataType::kHALF) {
+      VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp16";
+
+      half max = 65504.0;
+      enqueueImpl<half>(input_desc,
+                        output_desc,
+                        inputs,
+                        outputs,
+                        workspace,
+                        stream,
+                        device_id,
+                        max,
+                        keep_first_token_,
+                        keep_order_);
+    } else {
+      PADDLE_THROW(platform::errors::Fatal(
+          "The FusedTokenPrune TRT Plugin's input type "
+          "should be float or half."));
+    }
   }
   return cudaGetLastError() != cudaSuccess;
 }
-#endif

 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
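Review note: cub::DeviceScan::ExclusiveSum is intentionally called twice; the
first call (null d_temp_storage) only reports temp_storage_bytes, the second
performs the scan. For pruned per-batch lengths [3, 2, 4], the exclusive sum
yields the new pos_id offsets [0, 3, 5, 9]. Two caveats worth flagging: the
cudaMalloc'd scratch buffer is never freed, so every enqueue leaks it, and no
stream is passed, so the scan runs on the default stream rather than the
enqueue stream. A minimal sketch of the same call pattern with both addressed
(assumes the surrounding variables from enqueue):

    #include <cub/cub.cuh>

    size_t temp_storage_bytes = 0;
    void* d_temp_storage = nullptr;
    // Phase 1: query scratch size (no work is done while the pointer is null).
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                  pruned_token_lengths_, outputs3, B + 1,
                                  stream);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Phase 2: run the exclusive prefix sum on the enqueue stream.
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                  pruned_token_lengths_, outputs3, B + 1,
                                  stream);
    cudaFree(d_temp_storage);  // avoid leaking scratch on every enqueue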
diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
index 0b32e8a552bb7e6e1710df13d4d5a6fc6c03458d..4c9c24c59afa29279a615016f4342fa411ff30d5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
@@ -16,6 +16,7 @@

 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace inference {
@@ -30,11 +31,10 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
                               bool keep_first_token,
                               bool keep_order,
                               bool flag_varseqlen)
-      : keep_first_token_(keep_first_token),
+      : with_fp16_(with_fp16),
+        keep_first_token_(keep_first_token),
         keep_order_(keep_order),
-        flag_varseqlen_(flag_varseqlen) {
-    with_fp16_ = with_fp16;
-  }
+        flag_varseqlen_(flag_varseqlen) {}
   FusedTokenPrunePluginDynamic(void const* serial_data, size_t serial_length) {
     DeserializeValue(&serial_data, &serial_length, &with_fp16_);
     DeserializeValue(&serial_data, &serial_length, &keep_first_token_);
@@ -42,8 +42,14 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serial_data, &serial_length, &flag_varseqlen_);
   }
   nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
-    return new FusedTokenPrunePluginDynamic(
+    FusedTokenPrunePluginDynamic* ptr = new FusedTokenPrunePluginDynamic(
         with_fp16_, keep_first_token_, keep_order_, flag_varseqlen_);
+    ptr->max_batchs_ = max_batchs_;
+    ptr->max_token_length_ = max_token_length_;
+    ptr->pruned_token_lengths_ = pruned_token_lengths_;
+    ptr->token_index_ = token_index_;
+    ptr->padding_scores_ = padding_scores_;
+    return ptr;
   }

   const char* getPluginType() const TRT_NOEXCEPT override {
@@ -84,7 +90,16 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
   void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                        int nb_inputs,
                        const nvinfer1::DynamicPluginTensorDesc* out,
-                       int nb_outputs) TRT_NOEXCEPT override {}
+                       int nb_outputs) TRT_NOEXCEPT override {
+    max_batchs_ = in[1].max.d[0];
+    max_token_length_ = in[1].max.d[1];
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
+        &pruned_token_lengths_, (max_batchs_ + 1) * sizeof(int32_t)));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
+        &token_index_, max_batchs_ * max_token_length_ * sizeof(int32_t)));
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(
+        &padding_scores_, max_batchs_ * max_token_length_ * sizeof(half)));
+  }

   size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                           int nb_inputs,
@@ -106,9 +121,15 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
   void destroy() TRT_NOEXCEPT override { delete this; }

  private:
+  bool with_fp16_;
   bool keep_first_token_;
   bool keep_order_;
   bool flag_varseqlen_;
+  int32_t* pruned_token_lengths_;
+  int32_t* token_index_;
+  int32_t max_batchs_;
+  int32_t max_token_length_;
+  half* padding_scores_;
 };

 class FusedTokenPrunePluginDynamicCreator : public nvinfer1::IPluginCreator {
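Review note on ownership: clone() copies the raw device pointers
(pruned_token_lengths_, token_index_, padding_scores_) into the new instance,
and configurePlugin() allocates fresh buffers every time it runs, so clones
can share or leak device memory. One possible tightening (hypothetical sketch,
not part of this patch) is to release the buffers in terminate():

    void terminate() TRT_NOEXCEPT override {
      // Frees the buffers allocated in configurePlugin(). With pointers
      // shared across clones this still needs reference counting or
      // per-clone allocation; the sketch covers the single-instance case.
      if (pruned_token_lengths_) {
        cudaFree(pruned_token_lengths_);
        pruned_token_lengths_ = nullptr;
      }
      if (token_index_) {
        cudaFree(token_index_);
        token_index_ = nullptr;
      }
      if (padding_scores_) {
        cudaFree(padding_scores_);
        padding_scores_ = nullptr;
      }
    }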
diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
index c6be871709452aefd5cbbb2db35323623b72bcf4..50884b79d835eb506dac9f4e7474c6ceb828a1fd 100644
--- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
@@ -19,9 +19,9 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

-__global__ void RecoverPaddingKernel(const float* input0,
+__global__ void RecoverPaddingKernel(const half* input0,
                                      const int32_t* input1,
-                                     float* output) {
+                                     half* output) {
   int word_id = blockIdx.x * gridDim.y + blockIdx.y;
   int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x];
   if (blockIdx.y < seqence_length) {
@@ -79,7 +79,7 @@ bool RecoverPaddingPlugin::supportsFormatCombination(
     return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   } else {
-    return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+    return inOut[pos].type == nvinfer1::DataType::kHALF &&
           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   }
   // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format
@@ -114,38 +114,43 @@ int RecoverPaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
   const auto input0_desc = inputDesc[0];
   const auto input1_desc = inputDesc[1];
   const auto input2_desc = inputDesc[2];
-  const float* input0 = static_cast<const float*>(inputs[0]);
+  const half* input0 = static_cast<const half*>(inputs[0]);
   const int32_t* input1 =
       static_cast<const int32_t*>(inputs[1]);  // pos_id_tensor
-  float* output = static_cast<float*>(outputs[0]);
+  half* output = static_cast<half*>(outputs[0]);
+  const int32_t vector_length = input0_desc.dims.d[1];
   int32_t num_threads;
-  if (input0_desc.dims.d[1] % 512 == 0) {
-    num_threads = 512;
-  } else if (input0_desc.dims.d[1] % 256 == 0) {
-    num_threads = 256;
-  } else if (input0_desc.dims.d[1] % 128 == 0) {
-    num_threads = 128;
-  } else if (input0_desc.dims.d[1] % 64 == 0) {
-    num_threads = 64;
-  } else if (input0_desc.dims.d[1] % 32 == 0) {
-    num_threads = 32;
-  } else if (input0_desc.dims.d[1] % 16 == 0) {
-    num_threads = 16;
-  } else if (input0_desc.dims.d[1] % 8 == 0) {
-    num_threads = 8;
-  } else if (input0_desc.dims.d[1] % 4 == 0) {
-    num_threads = 4;
-  } else if (input0_desc.dims.d[1] % 2 == 0) {
-    num_threads = 2;
+  if (vector_length < 1024) {
+    num_threads = vector_length;
   } else {
-    num_threads = 1;
+    if (vector_length % 512 == 0) {
+      num_threads = 512;
+    } else if (vector_length % 256 == 0) {
+      num_threads = 256;
+    } else if (vector_length % 128 == 0) {
+      num_threads = 128;
+    } else if (vector_length % 64 == 0) {
+      num_threads = 64;
+    } else if (vector_length % 32 == 0) {
+      num_threads = 32;
+    } else if (vector_length % 16 == 0) {
+      num_threads = 16;
+    } else if (vector_length % 8 == 0) {
+      num_threads = 8;
+    } else if (vector_length % 4 == 0) {
+      num_threads = 4;
+    } else if (vector_length % 2 == 0) {
+      num_threads = 2;
+    } else {
+      num_threads = 1;
+    }
   }
   const dim3 num_blocks(
       input1_desc.dims.d[0] - 1,
       input2_desc.dims.d[1],
-      input0_desc.dims.d[1] / num_threads);  // batchs, max sequnce length
-                                             // (mask_id.dims.d[1]),
-                                             // input.dims.d[1]/256
+      vector_length / num_threads);  // batches, max sequence length
+                                     // (mask_id.dims.d[1]),
+                                     // input.dims.d[1] / num_threads
   RecoverPaddingKernel<<<num_blocks, num_threads, 0, stream>>>(
       input0, input1, output);
   return cudaGetLastError() != cudaSuccess;
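Review note: the same divisor ladder now appears in three files
(fused_token_prune_op_plugin.cu, recover_padding_plugin.cu, and
remove_padding_plugin.cu). It picks the largest power-of-two block size up to
512 that divides the vector length, or the length itself when it is under
1024. A shared helper would remove the triplication (hypothetical sketch):

    // Largest power-of-two (<= 512) dividing len, or len itself if len < 1024.
    inline int32_t GetNumThreads(int32_t len) {
      if (len < 1024) return len;
      for (int32_t t = 512; t > 1; t >>= 1) {
        if (len % t == 0) return t;
      }
      return 1;
    }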
diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
index 9f1a1d6d2c109a54a8419c787445daf156c81552..a18c0d0c7294b3e8a37586771fe1dc12cf164361 100644
--- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
@@ -19,9 +19,9 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {

-__global__ void RemovePaddingKernel(const float* input0,
+__global__ void RemovePaddingKernel(const half* input0,
                                     const int32_t* input1,
-                                    float* output) {
+                                    half* output) {
   int word_id = blockIdx.x * gridDim.y + blockIdx.y;
   int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x];
   if (blockIdx.y < seqence_length) {
@@ -73,7 +73,7 @@ bool RemovePaddingPlugin::supportsFormatCombination(
     return inOut[pos].type == nvinfer1::DataType::kINT32 &&
           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   } else {
-    return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+    return inOut[pos].type == nvinfer1::DataType::kHALF &&
           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   }
   // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format
@@ -106,38 +106,43 @@ int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
                                  void* workspace,
                                  cudaStream_t stream) TRT_NOEXCEPT {
   const auto input_desc = inputDesc[0];
-  const float* input0 = static_cast<const float*>(inputs[0]);
+  const half* input0 = static_cast<const half*>(inputs[0]);
   const int32_t* input1 =
       static_cast<const int32_t*>(inputs[1]);  // pos_id_tensor
-  float* output = static_cast<float*>(outputs[0]);
+  half* output = static_cast<half*>(outputs[0]);
   const auto input0_desc = inputDesc[0];
+  const int32_t vector_length = input0_desc.dims.d[2];
   int32_t num_threads;
-  if (input0_desc.dims.d[2] % 512 == 0) {
-    num_threads = 512;
-  } else if (input0_desc.dims.d[2] % 256 == 0) {
-    num_threads = 256;
-  } else if (input0_desc.dims.d[2] % 128 == 0) {
-    num_threads = 128;
-  } else if (input0_desc.dims.d[2] % 64 == 0) {
-    num_threads = 64;
-  } else if (input0_desc.dims.d[2] % 32 == 0) {
-    num_threads = 32;
-  } else if (input0_desc.dims.d[2] % 16 == 0) {
-    num_threads = 16;
-  } else if (input0_desc.dims.d[2] % 8 == 0) {
-    num_threads = 8;
-  } else if (input0_desc.dims.d[2] % 4 == 0) {
-    num_threads = 4;
-  } else if (input0_desc.dims.d[2] % 2 == 0) {
-    num_threads = 2;
+  if (vector_length < 1024) {
+    num_threads = vector_length;
   } else {
-    num_threads = 1;
+    if (vector_length % 512 == 0) {
+      num_threads = 512;
+    } else if (vector_length % 256 == 0) {
+      num_threads = 256;
+    } else if (vector_length % 128 == 0) {
+      num_threads = 128;
+    } else if (vector_length % 64 == 0) {
+      num_threads = 64;
+    } else if (vector_length % 32 == 0) {
+      num_threads = 32;
+    } else if (vector_length % 16 == 0) {
+      num_threads = 16;
+    } else if (vector_length % 8 == 0) {
+      num_threads = 8;
+    } else if (vector_length % 4 == 0) {
+      num_threads = 4;
+    } else if (vector_length % 2 == 0) {
+      num_threads = 2;
+    } else {
+      num_threads = 1;
+    }
   }
   const dim3 num_blocks(
       input0_desc.dims.d[0],
       input0_desc.dims.d[1],
-      input0_desc.dims.d[2] /
-          num_threads);  // batchs, max sequnce length, input.dims.d[2]/256
+      vector_length /
+          num_threads);  // batches, max sequence length,
+                         // input0.dims.d[2] / num_threads
   RemovePaddingKernel<<<num_blocks, num_threads, 0, stream>>>(
       input0, input1, output);
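Review note: RemovePaddingKernel and RecoverPaddingKernel share the grid
layout (batches, max sequence length, vector_length / num_threads); each
thread moves one half element, and rows beyond a batch's true sequence length
(derived from consecutive pos_id entries) are skipped. A host reference of the
remove direction, for checking the index arithmetic (hypothetical sketch):

    #include <vector>

    // Pack padded [batches, max_len, hidden] data into [sum(seq_len), hidden]
    // using exclusive-sum offsets pos_id (size batches + 1).
    template <typename T>
    std::vector<T> RemovePaddingRef(const std::vector<T>& padded,
                                    const std::vector<int>& pos_id,
                                    int max_len,
                                    int hidden) {
      int batches = static_cast<int>(pos_id.size()) - 1;
      std::vector<T> packed(pos_id[batches] * hidden);
      for (int b = 0; b < batches; ++b) {
        int seq_len = pos_id[b + 1] - pos_id[b];
        for (int r = 0; r < seq_len; ++r)
          for (int e = 0; e < hidden; ++e)
            packed[(pos_id[b] + r) * hidden + e] =
                padded[(b * max_len + r) * hidden + e];
      }
      return packed;
    }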
diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
index 4cc20c436597578c69a95bc6ae0acc3e332bc45c..543e7dca22c31e068b80dc8a98e0e26c78572c15 100644
--- a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
@@ -26,12 +26,9 @@ TEST(fused_token_prune_op_plugin, test_plugin) {
                                    /*keep_first_token*/ false,
                                    /*keep_order*/ true,
                                    /*flag_varseqlen*/ false);
-  plugin.configurePlugin(nullptr, 4, nullptr, 2);
   plugin.initialize();
   plugin.getPluginType();
   plugin.getNbOutputs();
-  auto clone_plugin = plugin.clone();
-  clone_plugin->destroy();
   size_t buf_size = plugin.getSerializationSize();
   std::vector<char> buf(buf_size);
   plugin.serialize(buf.data());
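Review note: the test drops plugin.configurePlugin(nullptr, 4, nullptr, 2) and
the clone()/destroy() pair, presumably because configurePlugin now
dereferences in[1].max and calls cudaMalloc (a nullptr descriptor would
crash), and a destroyed clone would share live device pointers with the
original. If configurePlugin is to stay under test, it needs a populated
descriptor; a hypothetical sketch:

    nvinfer1::DynamicPluginTensorDesc in[4] = {};
    in[1].max.nbDims = 3;
    in[1].max.d[0] = 4;    // max batch
    in[1].max.d[1] = 128;  // max token length
    plugin.configurePlugin(in, 4, nullptr, 2);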