diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 7daab6dac19768e1d35c84bfd78d319c8a62512b..97557d2b14a7eacbfe3338a8c09bb6065b68f81f 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <deque>
 #include <memory>
-#include <queue>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
     const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
   ++remaining_;
   this->pool_.enqueue([=] {
-    std::queue<OpHandleBase *> op_queue;
-    op_queue.push(op);
+    std::deque<OpHandleBase *> op_queue;
+    op_queue.push_front(op);
 
     size_t complete = 0;
     while (!op_queue.empty()) {
-      OpHandleBase *op_to_run = op_queue.front();
-      op_queue.pop();
+      OpHandleBase *op_to_run = op_queue.back();
+      op_queue.pop_back();
 
       if (!RunOp(op_to_run, complete_q, &complete)) {
         return;
@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
           // NOTE(zjl): op with highest priority should run
           // first without switching to another thread.
           if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
-            op_queue.push(pending_op);
+            op_queue.push_back(pending_op);
           } else {
             if (op_to_run == nullptr) {
               op_to_run = pending_op;
@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
         }
       }
 
-      if (op_to_run != nullptr) op_queue.push(op_to_run);
+      if (op_to_run != nullptr) {
+        op_queue.push_front(op_to_run);
+      }
     }
     --remaining_;
     complete_q->Push(complete);
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index defe97cd6f2d4e5a9ca3fd1880d8bbfc0989e482..29f2de5de0699ff4bda5deecae4e9e02ed74f150 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -409,8 +409,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
-  // FIXME(zjl): recurrent_op is rather complex, we would
-  // disable gc forcely in recurrent_op
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
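
For reference, the core behavioral change in the first file is the switch from a FIFO std::queue to a std::deque that is always popped from the back: a newly unblocked op with Priority::kHighest is pushed to the back so it runs next on the same worker thread, while the op picked to continue the chain is pushed to the front and deferred. Below is a minimal standalone sketch of that scheduling pattern only; the Op struct, highest_priority field, and the hard-coded "unblocked" set are illustrative assumptions, not Paddle's OpHandleBase API.

// Standalone sketch (not Paddle code) of the deque-based scheduling pattern
// introduced above: always pop from the back, push_back high-priority work
// (so it runs next on this thread), push_front the carried-over op (so it is
// deferred until the high-priority work has drained).
#include <deque>
#include <iostream>
#include <string>
#include <vector>

struct Op {
  std::string name;
  bool highest_priority;  // stand-in for OpHandleBase::Priority::kHighest
};

int main() {
  Op a{"a", false}, b{"b", true}, c{"c", false};

  std::deque<Op *> op_queue;
  op_queue.push_front(&a);  // the op this worker was asked to run

  while (!op_queue.empty()) {
    Op *op_to_run = op_queue.back();  // always pop from the back
    op_queue.pop_back();
    std::cout << "run " << op_to_run->name << "\n";

    // Pretend running `a` unblocked `b` and `c` (in the real executor these
    // are the pending ops whose dependency count just reached zero).
    std::vector<Op *> unblocked;
    if (op_to_run == &a) unblocked = {&b, &c};

    Op *next_op = nullptr;
    for (Op *pending : unblocked) {
      if (pending->highest_priority) {
        op_queue.push_back(pending);  // popped next, no thread switch
      } else {
        next_op = pending;            // candidate to keep on this thread
      }
    }
    if (next_op != nullptr) {
      op_queue.push_front(next_op);   // deferred behind high-priority ops
    }
  }
  // Prints: run a, run b, run c -- b (highest priority) runs before c.
}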