diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 59a0aef480bfe1b1e63e8fb2344c5e22390a8f1b..6bca299813f166009bc33512e2154907d869cf56 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -109,7 +109,7 @@ void AllReduceOpHandle::RunImpl() {
               buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
               ncclSum, comm, stream));
           // TODO(Yancey1989): synchronize here can get better performance
-          // if don't use NCCL group call, but need more profileing.
+          // if NCCL group calls are not used, but this needs more profiling.
           if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
         });
       }