diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index daa19eb17c88240ad27ee422ae70469868089ca1..f1b8a20e41cc223b5e68e66eaa8221c4aec01295 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -245,7 +245,7 @@ struct FetchOpHandle : public OpHandle {
 
 class ParallelExecutorPrivate {
  public:
-  explicit ParallelExecutorPrivate(size_t num_threads = 12)
+  explicit ParallelExecutorPrivate(size_t num_threads = 0)
       : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {}
 
   std::vector<platform::Place> places_;
@@ -669,7 +669,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const {
 
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
-  bool use_event = false;
+  bool use_event = true;
   auto fetched_data = std::make_shared<FetchedData>(fetch_tensors.size());
   // Version --> VarHandle
   member_->exception_.reset();
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
index a12cdd45aa10c28abf14e55f0249682bc82706bc..1985f1f4e68db1e62ee7cfd3649312581840d02c 100644
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -90,7 +90,6 @@ size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
 template <>
 void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
   auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  VLOG(30) << "Allocating " << size << " bytes on " << place;
   auto* ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
     int cur_dev = platform::GetCurrentDeviceId();
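
The first hunk changes the default worker count from 12 to 0, and the constructor already treats 0 as "no thread pool" (pool_ stays null). Below is a minimal, self-contained sketch of that pattern; DemoExecutorPrivate, RunOp, and the stub ThreadPool are hypothetical stand-ins for illustration, not PaddlePaddle's actual classes.

#include <functional>
#include <memory>
#include <utility>

// Hypothetical stub standing in for Paddle's ThreadPool; a real pool
// queues work onto worker threads instead of running it inline.
class ThreadPool {
 public:
  explicit ThreadPool(size_t num_threads) : num_threads_(num_threads) {}
  void Run(std::function<void()> fn) { fn(); }

 private:
  size_t num_threads_;
};

// Same construction pattern as the diff: num_threads == 0 leaves the pool
// null, which the caller can treat as "execute on the current thread".
class DemoExecutorPrivate {
 public:
  explicit DemoExecutorPrivate(size_t num_threads = 0)
      : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {}

  void RunOp(std::function<void()> op) {
    if (pool_) {
      pool_->Run(std::move(op));  // asynchronous path through the pool
    } else {
      op();  // synchronous fallback when the pool is disabled
    }
  }

 private:
  std::unique_ptr<ThreadPool> pool_;
};

int main() {
  DemoExecutorPrivate exec;          // default of 0: no pool, ops run inline
  exec.RunOp([] { /* op body */ });
  return 0;
}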
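
The second hunk flips use_event from false to true. The consumer of the flag is not shown in this diff, but the name suggests CUDA-event-based ordering between streams; the sketch below shows the standard CUDA runtime pattern such a flag would enable, not Paddle's actual implementation. The stream names producer and consumer are assumptions for illustration.

#include <cuda_runtime.h>

int main() {
  cudaStream_t producer, consumer;
  cudaStreamCreate(&producer);
  cudaStreamCreate(&consumer);

  // Create an ordering-only event; timing is disabled since we never
  // measure elapsed time with it.
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

  // Mark the producer stream's current position...
  cudaEventRecord(done, producer);
  // ...and make the consumer stream wait for it on the device, without
  // blocking the host thread the way cudaDeviceSynchronize() would.
  cudaStreamWaitEvent(consumer, done, 0);

  cudaEventDestroy(done);
  cudaStreamDestroy(producer);
  cudaStreamDestroy(consumer);
  return 0;
}

Device-side waits like this let an executor keep many streams busy while still honoring data dependencies, which is presumably why enabling the flag was worthwhile here.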