提交 330fa95c 编写于 作者: F fengjiayi

Follow comments

上级 bfe08446
......@@ -139,7 +139,7 @@ struct TestBroadcastOpHandle {
PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
f::Tensor result_tensor;
f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor, true);
f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
for (int64_t i = 0; i < f::product(kDims); ++i) {
......@@ -185,7 +185,7 @@ struct TestBroadcastOpHandle {
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor, true);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float* ct = result_tensor.data<float>();
for (int64_t i = 0; i < f::product(kDims); ++i) {
......
......@@ -66,8 +66,7 @@ void FetchOpHandle::RunImpl() {
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i], true);
dev_ctxes_.at(t.place())->Wait();
TensorCopySync(t, cpu, &tensors_[i]);
#endif
} else {
tensors_[i].ShareDataWith(t);
......
......@@ -194,8 +194,7 @@ struct TestReduceOpHandle {
}
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor,
true);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float *ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
......@@ -240,8 +239,7 @@ struct TestReduceOpHandle {
auto &rt = out_var->Get<f::LoDTensor>();
f::Tensor result_tensor;
f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor,
true);
f::TensorCopySync(rt, cpu_place, &result_tensor);
float *ct = result_tensor.data<float>();
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
......
......@@ -20,7 +20,7 @@ namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst, bool sync) {
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
......@@ -48,9 +48,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
......@@ -61,9 +59,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
......@@ -72,9 +68,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream =
sync ? nullptr
: reinterpret_cast<const platform::CUDADeviceContext&>(ctx)
.stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
}
#endif
......@@ -92,6 +86,41 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
TensorCopy(src, dst_place, *dev_ctx, dst);
}
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
}
#endif
}
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
......
......@@ -24,10 +24,11 @@ namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst,
bool sync = false);
const platform::DeviceContext& ctx, Tensor* dst);
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
template <typename T>
void TensorFromVector(const std::vector<T>& src,
......
......@@ -57,9 +57,7 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
auto &dev_ctx = *pool.Get(src_item.place());
TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item, true);
TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
dst_item.set_lod(src_item.lod());
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
......
......@@ -62,7 +62,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
TensorCopy(input_tmp, *place, *context, &input, true);
TensorCopySync(input_tmp, *place, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
......@@ -87,8 +87,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......@@ -99,8 +98,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
......@@ -121,7 +119,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
TensorCopy(input_tmp, *place, *context, &input, true);
TensorCopySync(input_tmp, *place, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
......@@ -130,7 +128,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......@@ -142,7 +140,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
TensorCopy(input_tmp, *place, *context, &input, true);
TensorCopySync(input_tmp, *place, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
......@@ -150,7 +148,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......
......@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
TensorCopy(out_gpu, cpu_place, context, &out, true);
TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float16>({2, 2}, gpu_place);
......@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out, true);
TensorCopySync(out_gpu, cpu_place, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
......@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
TensorCopy(out_gpu, cpu_place, context, &out, true);
TensorCopySync(out_gpu, cpu_place, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input1, gpu_place, &input2_gpu);
out_gpu.mutable_data<float16>({3, 3}, gpu_place);
......@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out, true);
TensorCopySync(out_gpu, cpu_place, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
......@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
......@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3, true);
TensorCopySync(input3_gpu, cpu_place, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
......@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3, true);
TensorCopySync(input3_gpu, cpu_place, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
......@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3, true);
TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);
......@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu, true);
TensorCopySync(input1, gpu_place, &input1_gpu);
TensorCopySync(input2, gpu_place, &input2_gpu);
TensorCopySync(input3, gpu_place, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
......@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3, true);
TensorCopySync(input3_gpu, cpu_place, &input3);
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
......@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i);
}
TensorCopy(mat_a, gpu_place, context, &g_mat_a, true);
TensorCopy(vec_b, gpu_place, context, &g_vec_b, true);
TensorCopySync(mat_a, gpu_place, &g_mat_a);
TensorCopySync(vec_b, gpu_place, &g_vec_b);
paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c);
TensorCopy(g_vec_c, cpu_place, context, &vec_c, true);
TensorCopySync(g_vec_c, cpu_place, &vec_c);
if (!trans) {
for (int i = 0; i < m; ++i) {
......
......@@ -71,7 +71,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
paddle::framework::TensorCopy(input_tmp, *place, *context, &input, true);
paddle::framework::TensorCopySync(input_tmp, *place, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
......@@ -85,8 +85,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
......@@ -100,7 +99,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
TensorCopy(input_tmp, *place, *context, &input, true);
TensorCopySync(input_tmp, *place, &input);
}
paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
......@@ -110,7 +109,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
in_ptr = input_tmp.data<float>();
}
......
......@@ -180,8 +180,7 @@ void DoubleBufferReader::PrefetchThreadFunc() {
auto* gpu_ctx = ctxs_[cached_tensor_id].get();
gpu_batch.resize(cpu_batch.size());
for (size_t i = 0; i < cpu_batch.size(); ++i) {
framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i],
true);
framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
gpu_batch[i].set_lod(cpu_batch[i].lod());
}
}
......
......@@ -124,10 +124,8 @@ class ReshapeKernel : public framework::OpKernel<T> {
auto *shape_data = shape_tensor->data<int>();
framework::Tensor cpu_shape_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
&cpu_shape_tensor);
TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
ctx.device_context().Wait();
}
auto shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel());
......@@ -146,9 +144,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
out->Resize(out_dims);
if (!inplace) {
out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
ctx.device_context().Wait();
// TensorCopy will resize to in_dims.
framework::TensorCopySync(*in, ctx.GetPlace(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册