diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 33bd275e5cc507ec700b3694cd8b1df9672ec512..7d551106756070a14f94f39f19b775d022d90777 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor { int g_find_max; memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); if (g_find_max) { int len; memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); FindAbsMaxFunctor()(ctx, scale_arr, len, out_scale_data); } @@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor { const auto gpu_place = boost::get(ctx.GetPlace()); T accum; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), 0); T state; - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), 0); T scale; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), ctx.stream()); + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), ctx.stream()); memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - 0); - + ctx.stream()); + ctx.Wait(); state = rate * state + 1; accum = rate * accum + scale; scale = accum / state; memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), 0); + platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), 0); + platform::CPUPlace(), &state, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), 0); + platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); + ctx.Wait(); } };