diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 7c7f3e9059fbb1e3f2cca4f04edfff55c9452761..462d5181200326ab121bd6a8c3b98267e23615a0 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -59,8 +59,7 @@ class FetchOp : public framework::OperatorBase { // CPU outputs? auto &dev_ctx = *pool.Get(src_item.place()); - TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); - dev_ctx.Wait(); + TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item, true); dst_item.set_lod(src_item.lod()); VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 1741af8148bb90863f294ba4930006a58b5ddbf9..854a8ee4425edbf3f07177589b4d512027cc3394 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -72,8 +72,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopy(input_a_cpu, Place(), *context, &input_a, true); + TensorCopy(input_b_cpu, Place(), *context, &input_b, true); } std::vector input; @@ -89,7 +89,7 @@ void testConcat() { int* out_ptr; if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopy(out, CPUPlace(), *context, &out_cpu, true); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -144,8 +144,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopy(input_a_cpu, Place(), *context, &input_a, true); + TensorCopy(input_b_cpu, Place(), *context, &input_b, true); } input.clear(); @@ -159,7 +159,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopy(out, CPUPlace(), *context, &out_cpu, true); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -216,8 +216,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopy(input_a_cpu, Place(), *context, &input_a, true); + TensorCopy(input_b_cpu, Place(), *context, &input_b, true); } input.clear(); @@ -231,7 +231,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopy(out, CPUPlace(), *context, &out_cpu, true); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); @@ -290,8 +290,8 @@ void testConcat() { } if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(input_a_cpu, Place(), *context, &input_a); - TensorCopy(input_b_cpu, Place(), *context, &input_b); + TensorCopy(input_a_cpu, Place(), *context, &input_a, true); + TensorCopy(input_b_cpu, Place(), *context, &input_b, true); } input.clear(); @@ -305,7 +305,7 @@ void testConcat() { PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); if (paddle::platform::is_gpu_place(Place())) { - TensorCopy(out, CPUPlace(), *context, &out_cpu); + TensorCopy(out, CPUPlace(), *context, &out_cpu, true); out_ptr = out_cpu.data(); } else { out_ptr = out.data(); diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index b3978536bca7dc276bf5417bed36761a2d496536..b09f95e0b2f6fb7033b586b5d433e44593e29461 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -62,7 +62,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopy(input_tmp, *place, *context, &input, true); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -87,7 +87,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp, + true); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -98,7 +99,8 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp, + true); out_ocf_ptr = output_tmp.data(); } @@ -119,7 +121,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopy(input_tmp, *place, *context, &input, true); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -128,7 +130,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -140,7 +142,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopy(input_tmp, *place, *context, &input, true); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -148,7 +150,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 8982d9d066165a9da0461288685baa0c60e5f114..ccfe0c6c079bb357bf63965ed7a80b0b2e65b6d9 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input1, gpu_place, context, &input2_gpu, true); out_gpu.mutable_data({2, 2}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out, true); float* out_ptr = out.data(); context.Wait(); @@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) { float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input1, gpu_place, context, &input2_gpu, true); out_gpu.mutable_data({2, 2}, gpu_place); @@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) { context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, float16(0)); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out, true); float16* out_ptr = out.data(); context.Wait(); @@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input1, gpu_place, context, &input2_gpu, true); out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out, true); float* out_ptr = out.data(); context.Wait(); @@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) { float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input1, gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input1, gpu_place, context, &input2_gpu, true); out_gpu.mutable_data({3, 3}, gpu_place); @@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) { context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, float16(0)); - TensorCopy(out_gpu, cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out, true); float16* out_ptr = out.data(); context.Wait(); @@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input2, gpu_place, context, &input2_gpu, true); + TensorCopy(input3, gpu_place, context, &input3_gpu, true); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3, true); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) { float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input2, gpu_place, context, &input2_gpu, true); + TensorCopy(input3, gpu_place, context, &input3_gpu, true); float16* a = input1_gpu.data(); float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); @@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3, true); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input2, gpu_place, context, &input2_gpu, true); + TensorCopy(input3, gpu_place, context, &input3_gpu, true); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); @@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3, true); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) { float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); - TensorCopy(input1, gpu_place, context, &input1_gpu); - TensorCopy(input2, gpu_place, context, &input2_gpu); - TensorCopy(input3, gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu, true); + TensorCopy(input2, gpu_place, context, &input2_gpu, true); + TensorCopy(input3, gpu_place, context, &input3_gpu, true); float16* a = input1_gpu.data(); float16* b = input2_gpu.data(); float16* c = input3_gpu.mutable_data(gpu_place); @@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) { context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), c + 1, 4); - TensorCopy(input3_gpu, cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3, true); context.Wait(); EXPECT_EQ(static_cast(input3_ptr[0]), 0); @@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - TensorCopy(mat_a, gpu_place, context, &g_mat_a); - TensorCopy(vec_b, gpu_place, context, &g_vec_b); + TensorCopy(mat_a, gpu_place, context, &g_mat_a, true); + TensorCopy(vec_b, gpu_place, context, &g_vec_b, true); paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - TensorCopy(g_vec_c, cpu_place, context, &vec_c); + TensorCopy(g_vec_c, cpu_place, context, &vec_c, true); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index eb91f862e39d9f158200f69fd48e7245dde47171..9de47dcc25192dbbb0b8144cf83ae3a316ca75f9 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -71,7 +71,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - paddle::framework::TensorCopy(input_tmp, *place, *context, &input); + paddle::framework::TensorCopy(input_tmp, *place, *context, &input, true); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -85,7 +85,8 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp); + TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp, + true); out_cfo_ptr = output_tmp.data(); } @@ -99,7 +100,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - TensorCopy(input_tmp, *place, *context, &input); + TensorCopy(input_tmp, *place, *context, &input, true); } paddle::operators::math::Col2VolFunctor col2vol; @@ -109,7 +110,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); + TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true); in_ptr = input_tmp.data(); } diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index 20b8a5c98ab16ac8121cb2fd01deb8ecc1966d44..ef54d79fdf2becde98c68044d14bd4347773b975 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -228,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs_[kRoot])->stream()); + paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, + recv_tensor.numel() * sizeof(float), nullptr); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], expected_result, 1e-5);