diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc
index 6dc53c9649e9d54ab627825b59ab13d779c742bd..05215a9e5f14bb3ee2407b560e94bfec0b9a5c3f 100644
--- a/paddle/fluid/framework/executor_gc_helper.cc
+++ b/paddle/fluid/framework/executor_gc_helper.cc
@@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope,
       for (auto &t : *lod_tensor_arr) {
         garbages.emplace_back(t.MoveMemoryHolder());
       }
+      // NOTE(wangxi): Clear the vector; otherwise lod_tensor_arr.size() is
+      // wrong, and an error may occur if size() decreases in the next step.
+      lod_tensor_arr->clear();
     } else if (var->IsType<Scope>()) {
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h
index 6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f..0fe76fa23a6373bc6710c25342057927871f7e3f 100644
--- a/paddle/fluid/operators/fused/fused_dropout_common.h
+++ b/paddle/fluid/operators/fused/fused_dropout_common.h
@@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids(
     const platform::CUDADeviceContext &ctx, const uint32_t rows,
     const uint32_t cols, const int vec_size) {
   const uint32_t tmp_cols = cols / vec_size;
-  int threads = std::max(
-      static_cast<uint32_t>(32),
-      std::min(tmp_cols, static_cast<uint32_t>(ctx.GetMaxThreadsPerBlock())));
+  // NOTE(wangxi): We cap max_block_size at 512 because
+  // `FusedResidualDropoutBias` needs too many register resources. If
+  // data_type is float16 and block_size is 1024, CUDA error 701
+  // (cudaErrorLaunchOutOfResources) occurs, which indicates that the launch
+  // did not happen because it did not have appropriate resources.
+  // Of course, this kernel can be optimized later to reduce the use of
+  // registers.
+  int threads =
+      std::max(static_cast<uint32_t>(32),
+               std::min(tmp_cols, static_cast<uint32_t>(std::min(
+                                      ctx.GetMaxThreadsPerBlock(), 512))));
   const auto blocks_x =
       std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
   const auto blocks_y = std::max(static_cast<uint32_t>(1), rows);
diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
index 5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e..caceac1228e0a0f3ee88fab08e2de3625f74838a 100644
--- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
+++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu
@@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias {
                  dropout_prob, is_upscale_in_train, is_test);
     }
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
     // add residual
     for (int i = 0; i < rows; i++) {
       for (int j = 0; j < cols; j++) {
@@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias {
         src.data<T>(), residual.data<T>(), bias_ptr, mask.data<uint8_t>(),
         out.data<T>(), *ctx);
     ctx->Wait();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
   }
 
   void FusedBackward() {
@@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) {
   test.CheckOut(static_cast<float>(1e-5));
   test.CheckGrad(static_cast<float>(1e-3));
 }
+
+TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) {
+  // Used to test that `cudaErrorLaunchOutOfResources` will not occur.
+  int rows = 1;
+  int cols = 12288;
+  if (std::getenv("_rows") != nullptr) {
+    rows = atoi(std::getenv("_rows"));
+  }
+  if (std::getenv("_cols") != nullptr) {
+    cols = atoi(std::getenv("_cols"));
+  }
+  TestFusedResidualDropoutBias<platform::float16> test(rows, cols, 0, 0.0, true,
+                                                       true);
+  test.Run();
+  test.CheckOut(static_cast<platform::float16>(1e-1));
+  test.CheckGrad(static_cast<platform::float16>(1e-1));
+}
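
For reference, the following standalone sketch (not part of the patch) illustrates the capped launch-config arithmetic that fused_dropout_common.h uses after this change. The names LaunchConfig1D2D, Get1DBlocksAnd2DGridsSketch, and the max_threads_per_block parameter are hypothetical stand-ins introduced here for illustration, and vec_size = 4 is only an example value; the real code queries ctx.GetMaxThreadsPerBlock() on a CUDADeviceContext and returns platform::GpuLaunchConfig.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the launch configuration computed in
// Get1DBlocksAnd2DGrids: threads per block plus a 2D grid.
struct LaunchConfig1D2D {
  uint32_t threads;
  uint32_t blocks_x;
  uint32_t blocks_y;
};

// Sketch of the capped computation: at least one warp (32 threads), at most
// min(device limit, 512) threads, and never more threads than there are
// vectorized columns. `max_threads_per_block` replaces
// ctx.GetMaxThreadsPerBlock().
LaunchConfig1D2D Get1DBlocksAnd2DGridsSketch(uint32_t rows, uint32_t cols,
                                             int vec_size,
                                             int max_threads_per_block) {
  constexpr int kMaxBlockSize = 512;  // avoids cudaErrorLaunchOutOfResources
  const uint32_t tmp_cols = cols / vec_size;
  const uint32_t threads = std::max(
      static_cast<uint32_t>(32),
      std::min(tmp_cols, static_cast<uint32_t>(
                             std::min(max_threads_per_block, kMaxBlockSize))));
  const uint32_t blocks_x =
      std::max(static_cast<uint32_t>(1), (tmp_cols + threads - 1) / threads);
  const uint32_t blocks_y = std::max(static_cast<uint32_t>(1), rows);
  return {threads, blocks_x, blocks_y};
}

int main() {
  // Shape from the new fp16 test (rows = 1, cols = 12288), with the
  // illustrative vec_size = 4 and a device limit of 1024 threads per block.
  const LaunchConfig1D2D cfg = Get1DBlocksAnd2DGridsSketch(1, 12288, 4, 1024);
  std::printf("threads=%u blocks_x=%u blocks_y=%u\n", cfg.threads,
              cfg.blocks_x, cfg.blocks_y);  // threads=512 blocks_x=6 blocks_y=1
  return 0;
}

With that illustrative shape, the cap yields 512 threads and a 6 x 1 grid instead of the 1024-thread block that triggered error 701 for float16.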