diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index b0bf641d9d0b54f4788b14e25caf317c8eea3c27..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -103,50 +103,23 @@ void BroadcastOpHandle::RunImpl() {
     });
   }
 
-  // FIXME(zcd): a temporary fix for some language model that has sparse
-  // parameter.
-  bool use_mutex = true;
-  if (in_var->IsType<framework::SelectedRows>()) {
-    use_mutex = false;
-  }
-  if (use_mutex) {
-    this->RunAndRecordEvent([&] {
-      {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : broadcast_calls) {
-          call();
-        }
-      }
-
-      if (!out_handle->IsTheSameVar(*in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                           ->FindVar(out_var_handles[0]->name_);
-        paddle::framework::TensorCopy(
-            in_tensor, in_var_handle->place_,
-            *(dev_ctxes_.at(in_var_handle->place_)),
-            &VariableVisitor::GetMutableTensor(out_var));
-      }
-    });
-  } else {
-    this->RunAndRecordEventNoMutex([&] {
-      {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : broadcast_calls) {
-          call();
-        }
-      }
-
-      if (!out_handle->IsTheSameVar(*in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                           ->FindVar(out_var_handles[0]->name_);
-        paddle::framework::TensorCopy(
-            in_tensor, in_var_handle->place_,
-            *(dev_ctxes_.at(in_var_handle->place_)),
-            &VariableVisitor::GetMutableTensor(out_var));
+  this->RunAndRecordEvent([&] {
+    {
+      platform::NCCLGroupGuard guard;
+      for (auto &call : broadcast_calls) {
+        call();
       }
-    });
-  }
+    }
+    if (!out_handle->IsTheSameVar(*in_var_handle)) {
+      auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                         ->FindVar(out_var_handles[0]->name_);
+      paddle::framework::TensorCopy(
+          in_tensor, in_var_handle->place_,
+          *(dev_ctxes_.at(in_var_handle->place_)),
+          &VariableVisitor::GetMutableTensor(out_var));
+    }
+  });
 #else
   PADDLE_THROW("CUDA is not enabled.");
 #endif
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index a40a8815087f246996e4601b36304afd5544234e..1f84c3b9e2d7ee9ae51959988fceeb3451b7b3b8 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/details/op_handle_base.h"
+#include <map>
 
 namespace paddle {
 namespace framework {
@@ -122,35 +122,17 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
   if (!events_.empty()) {  // Use event
     std::function<void()> method = callback;
-
+    // NOTE(zcd): the device contexts must be ordered here because RecordEvent
+    // will use a mutex to ensure thread safety.
+    std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
     for (auto &p : dev_ctxes_) {
-      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-            method);
-      };
+      ordered_ctxes.emplace(p.second, p.first);
     }
-    method();
-  } else {
-#endif
-    callback();
-#ifdef PADDLE_WITH_CUDA
-  }
-#endif
-}
-
-void OpHandleBase::RunAndRecordEventNoMutex(
-    const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (!events_.empty()) {  // Use event
-    std::function<void()> method = callback;
-
-    for (auto &p : dev_ctxes_) {
+    for (auto &p : ordered_ctxes) {
       method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)
-            ->RecordEventNoMutex(
-                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-                method);
+        static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.second).device),
+            method);
       };
     }
     method();
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 775be0233a4a841dd210edbaa2da42dd739eae80..fbd90a3296bca92b097cab925b218b91e7f4752f 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -85,10 +85,6 @@ class OpHandleBase {
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  void RunAndRecordEventNoMutex(const std::function<void()> &callback);
-
   void RunAndRecordEvent(platform::Place p,
                          const std::function<void()> &callback);
 
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 9a626c890fa20b9d69812acbe8d899c3f72b1ca3..7160e346dad0615e2fd32b70c096880af0359e1a 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -80,9 +80,7 @@ void ReduceOpHandle::RunImpl() {
   }
 
   if (pre_in_var->IsType<framework::SelectedRows>()) {
-    // FIXME(zcd): A temporary fix for some language model that has sparse
-    // parameter.
-    this->RunAndRecordEventNoMutex([&] {
+    this->RunAndRecordEvent([&] {
       std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index d37e5ee57859ec90de8a99416a1600b32796f46e..292ffef1aef12732812b8c5b0020cad73b1d06fc 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -106,14 +106,6 @@ class CUDADeviceContext : public DeviceContext {
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
 
-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  template <typename Callback>
-  void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
-  }
-
  private:
   CUDAPlace place_;
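
Why the patch replaces the NoMutex workaround with an ordered map: RecordEvent locks the device context's mutex, and RunAndRecordEvent nests one RecordEvent call per device context around the callback, so each op handle ends up holding several of those mutexes at once. If different op handles acquired them in different orders, two threads could deadlock; iterating a std::map keyed on the context pointer gives every thread the same acquisition order. The sketch below illustrates only that locking pattern; it is a standalone example with made-up names (FakeDeviceContext, RunAndRecord), not PaddlePaddle's actual classes.

// Minimal sketch of deadlock avoidance via a fixed lock-acquisition order,
// assuming RecordEvent takes a per-context mutex as the NOTE above states.
#include <functional>
#include <map>
#include <mutex>
#include <thread>
#include <vector>

struct FakeDeviceContext {   // stand-in for a device context with a mutex
  std::mutex mu;             // the mutex RecordEvent is assumed to take
  void RecordEvent(const std::function<void()> &cb) {
    std::lock_guard<std::mutex> guard(mu);
    cb();                    // run the (already wrapped) callback
  }
};

// Mirrors the wrapping loop above: build a chain of lambdas, innermost being
// the real callback, with one RecordEvent per context.
void RunAndRecord(const std::map<FakeDeviceContext *, int> &ordered_ctxes,
                  std::function<void()> callback) {
  std::function<void()> method = std::move(callback);
  for (auto &p : ordered_ctxes) {
    FakeDeviceContext *ctx = p.first;
    method = [method, ctx]() { ctx->RecordEvent(method); };
  }
  method();  // acquires the mutexes in a fixed, thread-independent order
}

int main() {
  FakeDeviceContext ctx0, ctx1;
  // Every thread sees the same std::map, hence the same locking order.
  std::map<FakeDeviceContext *, int> ordered_ctxes{{&ctx0, 0}, {&ctx1, 1}};
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back(
        [&] { RunAndRecord(ordered_ctxes, [] { /* op body */ }); });
  }
  for (auto &t : workers) t.join();
  return 0;
}

Any globally consistent key would do; the patch keys ordered_ctxes on the DeviceContext pointer because that is the value dev_ctxes_ already holds and it stays stable for the life of the contexts.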