diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index dec761399f0f149429242a40df4a75b4fd90e840..8f1b6d1615312fced0887f9ff14ae17877371b7e 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -139,7 +139,7 @@ struct TestBroadcastOpHandle {
       PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
 
       f::Tensor result_tensor;
-      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor, true);
+      f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
       float* ct = result_tensor.mutable_data<float>(cpu_place);
 
       for (int64_t i = 0; i < f::product(kDims); ++i) {
@@ -185,7 +185,7 @@ struct TestBroadcastOpHandle {
       }
 
       f::Tensor result_tensor;
-      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor, true);
+      f::TensorCopySync(rt, cpu_place, &result_tensor);
       float* ct = result_tensor.data<float>();
 
       for (int64_t i = 0; i < f::product(kDims); ++i) {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 423449abff97dbf70d81314f852d9135e25f243f..1e8ca20b51d43554cf1898b41b31c27b90e6c642 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -66,8 +66,7 @@ void FetchOpHandle::RunImpl() {
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
-      dev_ctxes_.at(t.place())->Wait();
+      TensorCopySync(t, cpu, &tensors_[i]);
 #endif
     } else {
       tensors_[i].ShareDataWith(t);
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index c557875013b653e76118915381459ae8dc9d3b56..ffdd7c14eb5097cc8285da090e4a72e1e3f43d86 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -194,8 +194,7 @@ struct TestReduceOpHandle {
     }
 
     f::Tensor result_tensor;
-    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor,
-                  true);
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
     float *ct = result_tensor.data<float>();
 
     for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
@@ -240,8 +239,7 @@ struct TestReduceOpHandle {
     auto &rt = out_var->Get<f::LoDTensor>();
 
     f::Tensor result_tensor;
-    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor,
-                  true);
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
     float *ct = result_tensor.data<float>();
 
     for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index d2e60ab1dd16758a91d22ef6872edc5053ef88b3..e5bc74755f46449296a153e8b330968e6d9f1e1d 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, Tensor* dst, bool sync) {
+                const platform::DeviceContext& ctx, Tensor* dst) {
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
   src.check_memory_size();
@@ -48,9 +48,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     auto stream =
-        sync ? nullptr
-             : reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
-                   .stream();
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
@@ -61,9 +59,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
     auto stream =
-        sync ? nullptr
-             : reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
-                   .stream();
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
@@ -72,9 +68,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
     auto stream =
-        sync ? nullptr
-             : reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
-                   .stream();
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   }
 #endif
@@ -92,6 +86,41 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   TensorCopy(src, dst_place, *dev_ctx, dst);
 }
 
+void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
+                    Tensor* dst) {
+  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
+          << " to " << dst_place;
+  src.check_memory_size();
+  dst->Resize(src.dims());
+  dst->set_layout(src.layout());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+  auto size = src.numel() * SizeOfType(src.type());
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  }
+#endif
+}
+
 template <typename Predicate, typename DevCtx>
 struct AnyDTypeVisitor {
   Predicate predicate_;
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 3af68402dc56230171e858bf8f8f8c89c2bfe760..dca279b69382b80e055f661cefe84b81326704b5 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -24,10 +24,11 @@ namespace paddle {
 namespace framework {
 
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, Tensor* dst,
-                bool sync = false);
+                const platform::DeviceContext& ctx, Tensor* dst);
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
+void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
+                    Tensor* dst);
 
 template <typename T>
 void TensorFromVector(const std::vector<T>& src,
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 462d5181200326ab121bd6a8c3b98267e23615a0..18deec58137676a0b2c8d559e49d0f7a840cd5ba 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -57,9 +57,7 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
    //  CPU outputs?
-    auto &dev_ctx = *pool.Get(src_item.place());
-
-    TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item, true);
+    TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
     dst_item.set_lod(src_item.lod());
 
     VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index b09f95e0b2f6fb7033b586b5d433e44593e29461..887c64b32a6d320c55983a558d2f1046b205786a 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -62,7 +62,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input, true);
+    TensorCopySync(input_tmp, *place, &input);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -87,8 +87,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp,
-               true);
+    TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -99,8 +98,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp,
-               true);
+    TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp);
     out_ocf_ptr = output_tmp.data<float>();
   }
 
@@ -121,7 +119,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input, true);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   col2im(*context, output_cfo, dilation, stride, padding, &input);
@@ -130,7 +128,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -142,7 +140,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input, true);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
@@ -150,7 +148,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index ccfe0c6c079bb357bf63965ed7a80b0b2e65b6d9..7986326e96b2bb05c0936d366bda581d49b87032 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) {
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input1, gpu_place, context, &input2_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  TensorCopy(out_gpu, cpu_place, context, &out, true);
+  TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) {
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input1, gpu_place, context, &input2_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float16>({2, 2}, gpu_place);
 
@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
       context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
       float16(0));
 
-  TensorCopy(out_gpu, cpu_place, context, &out, true);
+  TensorCopySync(out_gpu, cpu_place, &out);
 
   float16* out_ptr = out.data<float16>();
   context.Wait();
@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) {
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input1, gpu_place, context, &input2_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  TensorCopy(out_gpu, cpu_place, context, &out, true);
+  TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) {
   float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input1, gpu_place, context, &input2_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float16>({3, 3}, gpu_place);
 
@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
       float16(0));
 
-  TensorCopy(out_gpu, cpu_place, context, &out, true);
+  TensorCopySync(out_gpu, cpu_place, &out);
 
   float16* out_ptr = out.data<float16>();
   context.Wait();
@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input2, gpu_place, context, &input2_gpu, true);
-  TensorCopy(input3, gpu_place, context, &input3_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input2, gpu_place, &input2_gpu);
+  TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3, true);
+  TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input2, gpu_place, context, &input2_gpu, true);
-  TensorCopy(input3, gpu_place, context, &input3_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input2, gpu_place, &input2_gpu);
+  TensorCopySync(input3, gpu_place, &input3_gpu);
   float16* a = input1_gpu.data<float16>();
   float16* b = input2_gpu.data<float16>();
   float16* c = input3_gpu.mutable_data<float16>(gpu_place);
@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
       context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
       c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3, true);
+  TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input2, gpu_place, context, &input2_gpu, true);
-  TensorCopy(input3, gpu_place, context, &input3_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input2, gpu_place, &input2_gpu);
+  TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3, true);
+  TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu, true);
-  TensorCopy(input2, gpu_place, context, &input2_gpu, true);
-  TensorCopy(input3, gpu_place, context, &input3_gpu, true);
+  TensorCopySync(input1, gpu_place, &input1_gpu);
+  TensorCopySync(input2, gpu_place, &input2_gpu);
+  TensorCopySync(input3, gpu_place, &input3_gpu);
   float16* a = input1_gpu.data<float16>();
   float16* b = input2_gpu.data<float16>();
   float16* c = input3_gpu.mutable_data<float16>(gpu_place);
@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
       c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3, true);
+  TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) {
     data_b[i] = static_cast<T>(i);
   }
 
-  TensorCopy(mat_a, gpu_place, context, &g_mat_a, true);
-  TensorCopy(vec_b, gpu_place, context, &g_vec_b, true);
+  TensorCopySync(mat_a, gpu_place, &g_mat_a);
+  TensorCopySync(vec_b, gpu_place, &g_vec_b);
 
   paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
      context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
       g_data_b, 0., g_data_c);
 
-  TensorCopy(g_vec_c, cpu_place, context, &vec_c, true);
+  TensorCopySync(g_vec_c, cpu_place, &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
index 9de47dcc25192dbbb0b8144cf83ae3a316ca75f9..1b9551b6f2e029434acaf6464c8a00b333ea9e81 100644
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -71,7 +71,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    paddle::framework::TensorCopy(input_tmp, *place, *context, &input, true);
+    paddle::framework::TensorCopySync(input_tmp, *place, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
@@ -85,8 +85,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp,
-               true);
+    TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -100,7 +99,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input, true);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
@@ -110,7 +109,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
  } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
 
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 4372f23fc1dbd85e43b04a9d644977392316c2e9..afbe56234530b6c1314d8eef1eb8cf1241aeacbf 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -180,8 +180,7 @@ void DoubleBufferReader::PrefetchThreadFunc() {
       auto* gpu_ctx = ctxs_[cached_tensor_id].get();
       gpu_batch.resize(cpu_batch.size());
       for (size_t i = 0; i < cpu_batch.size(); ++i) {
-        framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i],
-                              true);
+        framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
         gpu_batch[i].set_lod(cpu_batch[i].lod());
       }
     }
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 8320c257c9ab15efec29eabe99eca5b6f74c9e31..987b43cc92b0b064a6282f0f2484010987b2e041 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -124,10 +124,8 @@ class ReshapeKernel : public framework::OpKernel<T> {
      auto *shape_data = shape_tensor->data<int>();
      framework::Tensor cpu_shape_tensor;
      if (platform::is_gpu_place(ctx.GetPlace())) {
-        TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
-                   &cpu_shape_tensor);
+        TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
        shape_data = cpu_shape_tensor.data<int>();
-        ctx.device_context().Wait();
      }
      auto shape =
          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
@@ -146,9 +144,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
    out->Resize(out_dims);
    if (!inplace) {
      out->mutable_data<T>(ctx.GetPlace());
-      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-      ctx.device_context().Wait();
-      // TensorCopy will resize to in_dims.
+      framework::TensorCopySync(*in, ctx.GetPlace(), out);
      out->Resize(out_dims);
    } else {
      out->ShareDataWith(*in);
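
Reviewer note (not part of the patch): a minimal usage sketch of the new synchronous helper, assuming only the TensorCopySync declaration added to paddle/fluid/framework/tensor_util.h above. The wrapper function CopyToCpuForCheck is hypothetical and exists only for illustration.

// Minimal sketch: blocking device-to-host copy with the new helper.
// Hypothetical wrapper, not part of the patch.
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

namespace f = paddle::framework;

void CopyToCpuForCheck(const f::Tensor& gpu_tensor, f::Tensor* cpu_out) {
  // Old pattern: pass a DeviceContext and either set sync = true or call
  // Wait() on the context before touching the data on the host, e.g.
  //   f::TensorCopy(gpu_tensor, platform::CPUPlace(), dev_ctx, cpu_out, true);
  // New pattern: TensorCopySync resizes cpu_out, keeps the source layout, and
  // returns only after the copy has finished, so no explicit Wait() is needed.
  f::TensorCopySync(gpu_tensor, paddle::platform::CPUPlace(), cpu_out);
}

The design hinge is the nullptr stream that TensorCopySync passes to memory::Copy for the GPU cases, which (as I read Paddle's memory::Copy) selects the blocking copy path instead of an asynchronous stream copy; that is why the tests and fetch paths above can drop their Wait() calls.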