未验证 提交 08773b60 编写于 作者: Z Zeng Jinle 提交者: GitHub

fix reader default stream,test=develop (#17106)

上级 aa5307ce
......@@ -39,7 +39,7 @@ TEST(RetryAllocator, RetryAllocator) {
std::unique_ptr<LockedAllocator> locked_allocator(
new LockedAllocator(std::move(best_fit_allocator)));
size_t thread_num = 32;
size_t thread_num = 8;
size_t sleep_time = 40;
size_t extra_time = 10;
......
......@@ -74,12 +74,6 @@ void BufferedReader::ReadTillBufferFullAsync() {
}
void BufferedReader::ReadAsync(size_t i) {
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
}
#endif
position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
TensorVec &cpu = cpu_buffer_[i];
reader_->ReadNext(&cpu);
......@@ -94,17 +88,34 @@ void BufferedReader::ReadAsync(size_t i) {
// issues the copying command to the default stream, it will prevent two
// commands from different streams from running concurrently.
if (platform::is_gpu_place(place_)) {
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
TensorVec &gpu = gpu_buffer_[i];
if (gpu.empty()) {
gpu.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
} else {
PADDLE_ENFORCE_EQ(gpu.size(), cpu.size(),
"Input tensor number not matched");
}
std::vector<void *> gpu_ptrs;
gpu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
gpu[i].Resize(cpu[i].dims());
gpu[i].set_layout(cpu[i].layout());
gpu_ptrs.emplace_back(gpu[i].mutable_data(place_, cpu[i].type()));
}
// NOTE(zjl): cudaStreamWaitEvent() must be called after all
// gpu[i].mutable_data() calls have been made, since some ops release
// gpu memory immediately without waiting for the gpu kernel to end
platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data<void>();
auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
auto gpu_ptr = gpu_ptrs[i];
auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if (platform::is_cuda_pinned_place(cpu_place)) {
......@@ -116,12 +127,9 @@ void BufferedReader::ReadAsync(size_t i) {
boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
size, stream_);
} else {
// if cpu place is not pinned, async copy is slower than sync copy,
// so we use sync copy instead.
// TODO(zcd): The default stream should not be used here.
memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
0);
stream_);
}
gpu[i].set_lod(cpu[i].lod());
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册