diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 3f2dcde3e9597287d72046dd4f8b07faab1ede25..8f1b6d1615312fced0887f9ff14ae17877371b7e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -139,7 +139,7 @@ struct TestBroadcastOpHandle { PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); f::Tensor result_tensor; - f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor); + f::TensorCopySync(out_tensor, cpu_place, &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); for (int64_t i = 0; i < f::product(kDims); ++i) { @@ -185,7 +185,7 @@ struct TestBroadcastOpHandle { } f::Tensor result_tensor; - f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor); + f::TensorCopySync(rt, cpu_place, &result_tensor); float* ct = result_tensor.data(); for (int64_t i = 0; i < f::product(kDims); ++i) { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 423449abff97dbf70d81314f852d9135e25f243f..1e8ca20b51d43554cf1898b41b31c27b90e6c642 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -66,8 +66,7 @@ void FetchOpHandle::RunImpl() { auto &t = var->Get(); if (platform::is_gpu_place(t.place())) { #ifdef PADDLE_WITH_CUDA - TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true); - dev_ctxes_.at(t.place())->Wait(); + TensorCopySync(t, cpu, &tensors_[i]); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index c17aabee53680fba10eac289cf8f8bd5f7d419e8..ffdd7c14eb5097cc8285da090e4a72e1e3f43d86 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -194,7 +194,7 @@ struct TestReduceOpHandle { } f::Tensor result_tensor; - f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { @@ -239,7 +239,7 @@ struct TestReduceOpHandle { auto &rt = out_var->Get(); f::Tensor result_tensor; - f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d2e60ab1dd16758a91d22ef6872edc5053ef88b3..e5bc74755f46449296a153e8b330968e6d9f1e1d 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst, bool sync) { + const platform::DeviceContext& ctx, Tensor* dst) { VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); @@ -48,9 +48,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); auto stream = - sync ? nullptr - : reinterpret_cast(ctx) - .stream(); + reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { @@ -61,9 +59,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); auto stream = - sync ? nullptr - : reinterpret_cast(ctx) - .stream(); + reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { @@ -72,9 +68,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = - sync ? nullptr - : reinterpret_cast(ctx) - .stream(); + reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } #endif @@ -92,6 +86,41 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, TensorCopy(src, dst_place, *dev_ctx, dst); } +void TensorCopySync(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() + << " to " << dst_place; + src.check_memory_size(); + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + auto size = src.numel() * SizeOfType(src.type()); + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } +#endif +} + template struct AnyDTypeVisitor { Predicate predicate_; diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 3af68402dc56230171e858bf8f8f8c89c2bfe760..dca279b69382b80e055f661cefe84b81326704b5 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -24,10 +24,11 @@ namespace paddle { namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst, - bool sync = false); + const platform::DeviceContext& ctx, Tensor* dst); void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); +void TensorCopySync(const Tensor& src, const platform::Place& dst_place, + Tensor* dst); template void TensorFromVector(const std::vector& src, diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 7c7f3e9059fbb1e3f2cca4f04edfff55c9452761..18deec58137676a0b2c8d559e49d0f7a840cd5ba 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -57,10 +57,7 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - auto &dev_ctx = *pool.Get(src_item.place()); - - TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); - dev_ctx.Wait(); + TensorCopySync(src_item, platform::CPUPlace(), &dst_item); dst_item.set_lod(src_item.lod()); VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 1741af8148bb90863f294ba4930006a58b5ddbf9..19d056fa54777eff2881a346da071ff95126173c 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -72,8 +72,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopySync(input_a_cpu, Place(), &input_a); + TensorCopySync(input_b_cpu, Place(), &input_b); } std::vector input; @@ -89,7 +89,7 @@ void testConcat() { int* out_ptr; if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopySync(out, CPUPlace(), &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -144,8 +144,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopySync(input_a_cpu, Place(), &input_a); + TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -159,7 +159,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopySync(out, CPUPlace(), &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -216,8 +216,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopySync(input_a_cpu, Place(), &input_a); + TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -231,7 +231,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopySync(out, CPUPlace(), &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -290,8 +290,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopySync(input_a_cpu, Place(), &input_a); + TensorCopySync(input_b_cpu, Place(), &input_b); } input.clear(); @@ -305,7 +305,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopySync(out, CPUPlace(), &out_cpu); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index b497c2a66146eed7b4b39ead6b05a9977cd4be21..8e3f0f286823c383bb0c44d0e7887040ec9b20a0 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -63,7 +63,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopySync(input_tmp, *place, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -88,7 +88,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -99,7 +99,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp); out_ocf_ptr = output_tmp.data(); } @@ -120,7 +120,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopySync(input_tmp, *place, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -129,7 +129,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -141,7 +141,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopySync(input_tmp, *place, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -149,7 +149,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 8982d9d066165a9da0461288685baa0c60e5f114..7986326e96b2bb05c0936d366bda581d49b87032 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({2, 2}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopySync(out_gpu, cpu_place, &out); float* out_ptr = out.data(); context.Wait(); @@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) { float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({2, 2}, gpu_place); @@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) { context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, float16(0)); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopySync(out_gpu, cpu_place, &out); float16* out_ptr = out.data(); context.Wait(); @@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopySync(out_gpu, cpu_place, &out); float* out_ptr = out.data(); context.Wait(); @@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) { float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({3, 3}, gpu_place); @@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) { context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, float16(0)); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopySync(out_gpu, cpu_place, &out); float16* out_ptr = out.data(); context.Wait(); @@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input2, gpu_place, &input2_gpu); + TensorCopySync(input3, gpu_place, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopySync(input3_gpu, cpu_place, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) { float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input2, gpu_place, &input2_gpu); + TensorCopySync(input3, gpu_place, &input3_gpu); float16* a = input1_gpu.data(); float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); @@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopySync(input3_gpu, cpu_place, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input2, gpu_place, &input2_gpu); + TensorCopySync(input3, gpu_place, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopySync(input3_gpu, cpu_place, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) { float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopySync(input1, gpu_place, &input1_gpu); + TensorCopySync(input2, gpu_place, &input2_gpu); + TensorCopySync(input3, gpu_place, &input3_gpu); float16* a = input1_gpu.data(); float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); @@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) { context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopySync(input3_gpu, cpu_place, &input3); context.Wait(); EXPECT_EQ(static_cast(input3_ptr[0]), 0); @@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - TensorCopy(mat_a, gpu_place, context, &g_mat_a); - TensorCopy(vec_b, gpu_place, context, &g_vec_b); + TensorCopySync(mat_a, gpu_place, &g_mat_a); + TensorCopySync(vec_b, gpu_place, &g_vec_b); paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - TensorCopy(g_vec_c, cpu_place, context, &vec_c); + TensorCopySync(g_vec_c, cpu_place, &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index b4e4186c986605b4fd14b10cadab81f07be4b27b..aa979c4f10907e604758c3e2cfb776cb994c9ceb 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -72,7 +72,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - paddle::framework::TensorCopy(input_tmp, *place, *context, &input); + paddle::framework::TensorCopySync(input_tmp, *place, &input); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -86,7 +86,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -100,7 +100,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopySync(input_tmp, *place, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -110,7 +110,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index 20b8a5c98ab16ac8121cb2fd01deb8ecc1966d44..ef54d79fdf2becde98c68044d14bd4347773b975 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -228,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs_[kRoot])->stream()); + paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, + recv_tensor.numel() * sizeof(float), nullptr); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], expected_result, 1e-5); diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 3fdc31dfa5242b6487c308d395d70d7ff348bc73..e5efac461512a9a1869318d6547233589ca45a77 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -168,11 +168,10 @@ void DoubleBufferReader::PrefetchThreadFunc() { } if (platform::is_gpu_place(place_)) { auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id]; - auto* gpu_ctx = ctxs_[cached_tensor_id].get(); gpu_batch.resize(cpu_batch.size()); for (size_t i = 0; i < cpu_batch.size(); ++i) { - framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i], - true); + // TODO(fengjiayi): Use asynchronous TensorCopy instead + framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]); gpu_batch[i].set_lod(cpu_batch[i].lod()); } } diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 44a91ebd7c2d5760d88747e089bfa8cc660a7e86..ccd7063fe69e0f21b4d2a821bb70902b39c9b9de 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -130,10 +130,8 @@ class ReshapeKernel : public framework::OpKernel { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { - TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), - &cpu_shape_tensor); + TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); - ctx.device_context().Wait(); } auto shape = std::vector(shape_data, shape_data + shape_tensor->numel()); @@ -152,9 +150,7 @@ class ReshapeKernel : public framework::OpKernel { out->Resize(out_dims); if (!inplace) { out->mutable_data(ctx.GetPlace()); - framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); - ctx.device_context().Wait(); - // TensorCopy will resize to in_dims. + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); } else { out->ShareDataWith(*in);