diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index daa19eb17c88240ad27ee422ae70469868089ca1..f1b8a20e41cc223b5e68e66eaa8221c4aec01295 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -245,7 +245,7 @@ struct FetchOpHandle : public OpHandle {
 
 class ParallelExecutorPrivate {
  public:
-  explicit ParallelExecutorPrivate(size_t num_threads = 12)
+  explicit ParallelExecutorPrivate(size_t num_threads = 0)
       : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {}
 
   std::vector<platform::Place> places_;
@@ -669,7 +669,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const {
 
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
-  bool use_event = false;
+  bool use_event = true;
   auto fetched_data = std::make_shared<FetchedData>(fetch_tensors.size());
   // Version --> VarHandle
   member_->exception_.reset();
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index a12cdd45aa10c28abf14e55f0249682bc82706bc..1985f1f4e68db1e62ee7cfd3649312581840d02c 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -90,7 +90,6 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 template <>
 void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  VLOG(30) << "Allocating " << size << " bytes on " << place;
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
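
The first hunk changes the default worker count from 12 to 0, and the constructor already treats 0 as "no thread pool" (pool_ stays null). Below is a minimal, self-contained sketch of that pattern; DemoExecutorPrivate, RunOp, and the stub ThreadPool are hypothetical stand-ins for illustration, not PaddlePaddle's actual classes.

#include <functional>
#include <memory>
#include <utility>

// Hypothetical stub standing in for Paddle's ThreadPool; a real pool
// queues work onto worker threads instead of running it inline.
class ThreadPool {
 public:
  explicit ThreadPool(size_t num_threads) : num_threads_(num_threads) {}
  void Run(std::function<void()> fn) { fn(); }

 private:
  size_t num_threads_;
};

// Same construction pattern as the diff: num_threads == 0 leaves the pool
// null, which the caller can treat as "execute on the current thread".
class DemoExecutorPrivate {
 public:
  explicit DemoExecutorPrivate(size_t num_threads = 0)
      : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {}

  void RunOp(std::function<void()> op) {
    if (pool_) {
      pool_->Run(std::move(op));  // asynchronous path through the pool
    } else {
      op();  // synchronous fallback when the pool is disabled
    }
  }

 private:
  std::unique_ptr<ThreadPool> pool_;
};

int main() {
  DemoExecutorPrivate exec;          // default of 0: no pool, ops run inline
  exec.RunOp([] { /* op body */ });
  return 0;
}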
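
The second hunk flips use_event from false to true. The consumer of the flag is not shown in this diff, but the name suggests CUDA-event-based ordering between streams; the sketch below shows the standard CUDA runtime pattern such a flag would enable, not Paddle's actual implementation. The stream names producer and consumer are assumptions for illustration.

#include <cuda_runtime.h>

int main() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  // Create an ordering-only event; timing is disabled since we never
  // measure elapsed time with it.
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  // Mark the producer stream's current position...
  cudaEventRecord(done, producer);
  // ...and make the consumer stream wait for it on the device, without
  // blocking the host thread the way cudaDeviceSynchronize() would.
  cudaStreamWaitEvent(consumer, done, 0);

  cudaEventDestroy(done);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
  return 0;
}

Device-side waits like this let an executor keep many streams busy while still honoring data dependencies, which is presumably why enabling the flag was worthwhile here.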