/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"

#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/dense_tensor.h"

#ifdef PADDLE_WITH_MKLDNN
#include "dnnl_debug.h"  // NOLINT
#endif

namespace paddle {
namespace framework {

template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    const platform::DeviceContext& ctx,
                    TENSOR* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopyImpl(src_copy, dst_place, ctx, dst);
    return;
  }
  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
  auto src_place = src.place();
  auto src_ptr = src.data();
#ifdef PADDLE_WITH_MKLDNN
  dst->set_mem_desc(src.mem_desc());
  // oneDNN tensors may be padded, so their allocation can be larger than
  // numel() * size(type())
  auto dst_ptr =
      src.layout() == DataLayout::ONEDNN
          ? dst->mutable_data(dst_place, src.dtype(), src.memory_size())
          : dst->mutable_data(dst_place, src.dtype());
#else
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
#endif
  dst->set_layout(src.layout());
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data async from " << src_place
            << " to " << dst_place;
    return;
  }
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

#ifdef PADDLE_WITH_MKLDNN
  auto size = src.layout() == DataLayout::ONEDNN
                  ? src.memory_size()
                  : src.numel() * phi::SizeOf(src.dtype());
#else
  auto size = src.numel() * phi::SizeOf(src.dtype());
#endif

  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_custom_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_xpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data async from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Source place and context place do not match, source "
                          "place is %s, context place is %s.",
                          src_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Destination place and context place do not match, "
                          "destination place is %s, context place is %s.",
                          dst_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cuda_pinned_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from GPU memory to CUDA Pinned memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The source GPU device and current device context do "
                          "not match. The source GPU device number is %d, but "
                          "device context GPU number is %d.",
                          src_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cuda_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from CUDA Pinned memory to GPU memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The target GPU device and current device context do "
                          "not match. The target GPU device number is %d, but "
                          "device context GPU number is %d.",
                          dst_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
      memory::Copy(
          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
    } else {
      if (platform::is_same_place(ctx_place, src_place)) {
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
      } else if (platform::is_same_place(ctx_place, dst_place)) {
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
      } else {
        PADDLE_THROW(platform::errors::Unavailable(
            "Context place does not match the source and destination place."));
      }
    }
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}
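
// Note on the GPU-to-GPU branch above: for cross-device copies, the source
// device's context is synchronized (after the memcpy when the supplied
// context lives on the source device, before it when the context lives on
// the destination device), so the copy cannot race with work still pending
// on the source tensor.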

template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    TENSOR* dst) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
  if (platform::is_gpu_place(dst_place) ||
      platform::is_custom_place(dst_place)) {
    dev_ctx = pool.Get(dst_place);
  } else {
    dev_ctx = pool.Get(src.place());
  }
  TensorCopyImpl(src, dst_place, *dev_ctx, dst);
}

void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, dst);
}
void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                const platform::DeviceContext& ctx,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, ctx, dst);
}
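
// Illustrative usage sketch (an assumption, not part of the original file):
// the overload without an explicit context picks a DeviceContext from the
// global pool and issues the copy asynchronously on that context's stream,
// so the result must be synchronized before it is read.
//
//   phi::DenseTensor cpu_t, gpu_t;
//   cpu_t.Resize({2, 3});
//   cpu_t.mutable_data<float>(paddle::platform::CPUPlace());
//   paddle::framework::TensorCopy(
//       cpu_t, paddle::platform::CUDAPlace(0), &gpu_t);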

void TensorCopySync(const phi::DenseTensor& src,
                    const platform::Place& dst_place,
                    phi::DenseTensor* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopySync(src_copy, dst_place, dst);
    return;
  }

  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
          << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
#ifdef PADDLE_WITH_MKLDNN
  if (src.layout() == DataLayout::ONEDNN) {
    dst->set_mem_desc(src.mem_desc());
  }
#endif
  auto src_place = src.place();
  auto src_ptr = src.data();
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data from " << src_place << " to "
            << dst_place;
    return;
  }
  auto size = src.numel() * phi::SizeOf(src.dtype());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {     /* custom_device -> cpu*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                // NOLINT
  else if (platform::is_cpu_place(src_place) &&    // NOLINT
           platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                 // NOLINT
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_custom_place(
               dst_place)) { /* custom_device -> custom_device*/
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data sync from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data sync from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
    platform::XPUPlace xpu_dst_place = dst_place;
    platform::XPUPlace xpu_src_place = src_place;
    if (xpu_dst_place.device == xpu_src_place.device) {
      auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
      xpu_ctx->Wait();
    }
  }       // NOLINT
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(
        dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}
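
// Illustrative usage sketch (an assumption, not part of the original file):
// TensorCopySync passes a nullptr stream to memory::Copy, making the copy
// blocking, so the destination is safe to read as soon as the call returns.
//
//   phi::DenseTensor gpu_t, host_t;
//   // ... gpu_t lives on CUDAPlace(0) ...
//   paddle::framework::TensorCopySync(
//       gpu_t, paddle::platform::CPUPlace(), &host_t);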

void TensorToStream(std::ostream& os,
                    const phi::DenseTensor& tensor,
                    const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  }
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
    proto::VarType::TensorDesc desc;
    desc.set_data_type(framework::TransToProtoVarType(tensor.dtype()));
    auto dims = phi::vectorize(tensor.dims());
    auto* pb_dims = desc.mutable_dims();
    pb_dims->Resize(static_cast<int>(dims.size()), 0);
    std::copy(dims.begin(), dims.end(), pb_dims->begin());
    int32_t size = desc.ByteSize();
    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
    auto out = desc.SerializeAsString();
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
    uint64_t size = tensor.numel() * phi::SizeOf(tensor.dtype());

    auto* data_ptr = tensor.data();
    PADDLE_ENFORCE_LT(size,
                      (std::numeric_limits<std::streamsize>::max)(),
                      platform::errors::ResourceExhausted(
                          "tensor size %d overflow when writing tensor", size));
    if (platform::is_gpu_place(tensor.place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     gpu_dev_ctx.stream());
        gpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
#endif
    } else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& xpu_dev_ctx =
          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write);
        xpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
#endif
    } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& custom_device_context =
          static_cast<const platform::CustomDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     custom_device_context.stream());
        custom_device_context.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported when not compiled with "
          "CustomDevice"));
#endif
    } else {
      os.write(static_cast<const char*>(data_ptr),
               static_cast<std::streamsize>(size));
    }
  }
}
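
// Serialized layout produced by TensorToStream (derived from the code above):
//   1. uint32_t version    -- currently always 0
//   2. int32_t  desc_size  -- byte size of the TensorDesc protobuf message
//   3. desc_size bytes     -- serialized proto::VarType::TensorDesc
//   4. raw tensor data     -- numel() * SizeOf(dtype) bytes
// Device tensors are staged through a 64MB host buffer, chunk by chunk.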

struct DeserializedDataFunctor {
  DeserializedDataFunctor(void** buf,
                          phi::DenseTensor* tensor,
                          const platform::Place& place)
      : buf_(buf), tensor_(tensor), place_(place) {}

  template <typename T>
  void apply() {
    *buf_ = tensor_->mutable_data<T>(place_);
  }

  void** buf_;
  phi::DenseTensor* tensor_;
  platform::Place place_;
};

void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx,
                      const size_t& seek,
                      const std::vector<int64_t>& shape) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));

  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, Only version 0 is supported",
          version));

  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
    int32_t size;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    tensor->Resize(phi::make_ddim(shape));
    size_t seekg = seek * framework::SizeOfType(desc.data_type());
    is.seekg(seekg, is.cur);

    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(shape));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, Only version 0 is supported",
          version));
  proto::VarType::TensorDesc desc;
  {  // int32_t size
     // proto buffer
    int32_t size = -1;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    PADDLE_ENFORCE_EQ(
        is.good(),
        true,
        platform::errors::Unavailable("Cannot read tensor desc size"));
    PADDLE_ENFORCE_GE(size,
                      0,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor desc size should be >= 0"));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    std::vector<int64_t> dims;
    dims.reserve(static_cast<size_t>(desc.dims().size()));
    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
    tensor->Resize(phi::make_ddim(dims));
    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(dims));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      } else {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CustomPlace is not supported when not compiled with "
            "CustomDevice"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}
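
// Illustrative round trip (an assumption, not part of the original file):
// serialize a tensor and read it back through the stream API above.
//
//   std::stringstream ss;
//   phi::CPUContext cpu_ctx;
//   paddle::framework::TensorToStream(ss, src_tensor, cpu_ctx);
//   phi::DenseTensor restored;
//   paddle::framework::TensorFromStream(ss, &restored, cpu_ctx);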

// get the tensor data pointer by DLDataType
void* GetDstPtrByDLDataType(DLDataType type,
                            phi::DenseTensor* dst,
                            const platform::Place& dst_place) {
  // vector types not currently supported
  PADDLE_ENFORCE_LE(type.lanes,
                    1,
                    platform::errors::Unimplemented(
                        "Vector type is not supported currently."));

  switch (type.bits) {
    case 8:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
      if (type.code == kDLUInt)
        return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 16:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::float16>(dst_place));
      if (type.code == kDLBfloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::bfloat16>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 32:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<float>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 64:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<double>(dst_place));
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<float>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 128:
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<double>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported DLDataType.bits %d.", type.bits));
  }
}
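
// For reference: a DLPack descriptor for float32 is {kDLFloat, 32, 1}
// (code, bits, lanes); the switch above dispatches on bits first, then code.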

void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) {
  platform::CPUPlace dst_place = platform::CPUPlace();
  platform::CPUPlace src_place = platform::CPUPlace();

  std::vector<int64_t> vec;
  std::copy(dl_tensor.shape,
            dl_tensor.shape + dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);

  dst->Resize(vddim);
  ::DLDataType type = dl_tensor.dtype;
  void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);

  auto src_ptr = static_cast<const void*>(dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (dl_tensor.device.device_type == kDLCPU) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) {
  std::vector<int64_t> vec;
  std::copy(src->dl_tensor.shape,
            src->dl_tensor.shape + src->dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);
  dst->Resize(vddim);
  ::DLDataType type = src->dl_tensor.dtype;

  auto src_ptr = static_cast<const void*>(src->dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (src->dl_tensor.device.device_type == kDLCPU) {
    platform::CPUPlace dst_place = platform::CPUPlace();
    platform::CPUPlace src_place = platform::CPUPlace();
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (src->dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    // Fix copy by share allocation.
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
  src->deleter(const_cast<DLManagedTensor*>(src));
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}
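
// Illustrative usage sketch (an assumption, not part of the original file):
// importing a DLManagedTensor produced by another framework.
//
//   DLManagedTensor* dlm = ...;  // obtained from an external producer
//   phi::DenseTensor imported;
//   paddle::framework::TensorFromDLPack(dlm, &imported);
//   // TensorFromDLPack invokes dlm->deleter itself; do not free dlm again.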

template <typename T>
std::string format_tensor(const phi::DenseTensor& tensor) {
  // TODO(zhiqiu): use the print option to format tensor.
  return "NOT IMPLEMENTED";
}

template <typename T>
std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<T>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  // Note: int8_t and uint8_t are character types, so ostream would print them
  // as characters; cast through signed() below to print them as numbers.
  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
    if (element_num > 0) {
      os << signed(inspect[0]);
      for (int j = 1; j < element_num; ++j) {
        os << " " << signed(inspect[j]);
      }
    }
  } else {
    if (element_num > 0) {
      os << inspect[0];
      for (int j = 1; j < element_num; ++j) {
        os << " " << inspect[j];
      }
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<float>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<float>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<double>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<double>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  // NOTE(xiongkun):
  // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution
  // if we don't redefine, the operator << of phi / framework LoD is not found.
  paddle::string::operator<<(os, lod);
  return os;
}

}  // namespace framework
}  // namespace paddle

namespace phi {

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  paddle::string::operator<<(os, lod);
  return os;
}

std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
  if (!t.lod().empty()) {
    os << "  - lod: " << t.lod() << "\n";
  }

  os << "  - place: " << t.place() << "\n";
  os << "  - shape: [" << t.dims() << "]\n";
  os << "  - layout: " << phi::DataLayoutToString(t.layout()) << "\n";

  DenseTensor tensor;
  tensor.Resize(t.dims());
  if (paddle::platform::is_cpu_place(t.place())) {
    tensor.ShareDataWith(t);
  } else {
    paddle::platform::CPUPlace place;
    paddle::framework::TensorCopy(t, place, &tensor);
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
  }

#define PrintTensorCallback(cpp_type, proto_type)                 \
  do {                                                            \
    if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \
        proto_type) {                                             \
      os << "  - dtype: " << tensor.dtype() << "\n";              \
      paddle::framework::print_tensor<cpp_type>(os, tensor);      \
      return os;                                                  \
    }                                                             \
  } while (0)

  _ForEachDataType_(PrintTensorCallback);
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
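
// Illustrative usage sketch (an assumption, not part of the original file):
// any phi::DenseTensor can be streamed for debugging; non-CPU tensors are
// first copied to the host by the operator above.
//
//   phi::DenseTensor t;
//   // ... fill t ...
//   std::cout << t << std::endl;  // prints lod, place, shape, layout, data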
}  // namespace phi