/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/dense_tensor.h"

#ifdef PADDLE_WITH_MKLDNN
#include "dnnl_debug.h"  // NOLINT
#endif

namespace paddle {
namespace framework {
template <typename TENSOR>
38 39 40 41
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    const platform::DeviceContext& ctx,
                    TENSOR* dst) {
42 43
  if (&src == dst) {
    auto src_copy = src;
44
    TensorCopyImpl(src_copy, dst_place, ctx, dst);
45 46 47
    return;
  }

M
minqiyang 已提交
48 49
  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;
Y
Yi Wang 已提交
50 51 52 53
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
  auto src_place = src.place();
54
  auto src_ptr = src.data();
55
#ifdef PADDLE_WITH_MKLDNN
56
  dst->set_mem_desc(src.mem_desc());
57 58 59
  // oneDNN tensors due to padding may be of bigger size
  // than numel()*size(type())
  auto dst_ptr =
60
      src.layout() == DataLayout::ONEDNN
61 62
          ? dst->mutable_data(dst_place, src.dtype(), src.memory_size())
          : dst->mutable_data(dst_place, src.dtype());
63
#else
64
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
65
#endif
66
  dst->set_layout(src.layout());
67 68 69 70 71
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
            << dst_place;
    return;
  }
72
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
73

74
#ifdef PADDLE_WITH_MKLDNN
75
  auto size = src.layout() == DataLayout::ONEDNN
76
                  ? src.memory_size()
77
                  : src.numel() * phi::SizeOf(src.dtype());
78
#else
79
  auto size = src.numel() * phi::SizeOf(src.dtype());
80
#endif
Y
Yi Wang 已提交
81 82

  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
83
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
Y
Yi Wang 已提交
84
  }
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_custom_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
#endif
108 109 110
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
111
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
112 113
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
114
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
115 116 117 118 119 120 121
  } else if (platform::is_xpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
122
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
123 124 125 126 127
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
128
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
129 130
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
131
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
132
  }
133
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
Y
Yi Wang 已提交
134
           platform::is_cpu_place(dst_place)) {
135
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
136 137 138
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
139
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
140 141 142
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
143 144
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
Y
Yi Wang 已提交
145
    auto ctx_place = ctx.GetPlace();
146
    PADDLE_ENFORCE_EQ(
147 148
        platform::is_gpu_place(ctx_place),
        true,
149 150 151
        platform::errors::PreconditionNotMet(
            "Context place error, excepted GPUPlace, but actually %s.",
            ctx_place));
152
    auto ctx_gpu_place = ctx_place;
153 154
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
155 156 157
                      platform::errors::Unavailable(
                          "Source place and context place do not match, source "
                          "place is %s, context place is %s.",
158 159
                          src_gpu_place,
                          ctx_gpu_place));
L
Leo Chen 已提交
160
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
161
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
162 163 164
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
165 166
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
Y
Yi Wang 已提交
167
    auto ctx_place = ctx.GetPlace();
168
    PADDLE_ENFORCE_EQ(
169 170
        platform::is_gpu_place(ctx_place),
        true,
171 172 173
        platform::errors::PreconditionNotMet(
            "Context place error, excepted GPUPlace, but actually %s.",
            ctx_place));
174
    auto ctx_gpu_place = ctx_place;
175 176
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
177 178 179
                      platform::errors::Unavailable(
                          "Destination place and context place do not match, "
                          "destination place is %s, context place is %s.",
180 181
                          dst_gpu_place,
                          ctx_gpu_place));
L
Leo Chen 已提交
182
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
183
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
184 185 186
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
187 188
    auto src_gpu_place = src_place;
    auto dst_cuda_pinned_place = dst_place;
189
    auto ctx_place = ctx.GetPlace();
190 191 192 193 194 195 196
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from GPU memory to CUDA Pinned memory, current "
            "device context place should be GPU."));
197
    auto ctx_gpu_place = ctx_place;
198 199
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
200 201 202 203
                      platform::errors::PreconditionNotMet(
                          "The source GPU device and current device context do "
                          "not match. The source GPU device number is %d, but "
                          "device context GPU number is %d.",
204 205
                          src_gpu_place.device,
                          ctx_gpu_place.device));
L
Leo Chen 已提交
206
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
207 208
    memory::Copy(
        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
209 210 211
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
212 213
    auto src_cuda_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
214
    auto ctx_place = ctx.GetPlace();
215 216 217 218 219 220 221
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from CUDA Pinned memory to GPU memory, current "
            "device context place should be GPU."));
222
    auto ctx_gpu_place = ctx_place;
223 224
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
225 226 227 228
                      platform::errors::PreconditionNotMet(
                          "The target GPU device and current device context do "
                          "not match. The target GPU device number is %d, but "
                          "device context GPU number is %d.",
229 230
                          dst_gpu_place.device,
                          ctx_gpu_place.device));
L
Leo Chen 已提交
231
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
232 233
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
234 235 236
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
237 238
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
Y
Yi Wang 已提交
239
    auto ctx_place = ctx.GetPlace();
240
    PADDLE_ENFORCE_EQ(
241 242
        platform::is_gpu_place(ctx_place),
        true,
243 244 245
        platform::errors::PreconditionNotMet(
            "Context place error, excepted GPUPlace, but actually %s.",
            ctx_place));
L
Leo Chen 已提交
246
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
C
chengduo 已提交
247
    if (platform::is_same_place(src_place, dst_place)) {
248 249
      memory::Copy(
          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
C
chengduo 已提交
250 251
    } else {
      if (platform::is_same_place(ctx_place, src_place)) {
252 253
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
C
chengduo 已提交
254
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
C
chengduo 已提交
255
      } else if (platform::is_same_place(ctx_place, dst_place)) {
C
chengduo 已提交
256
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
257 258
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
C
chengduo 已提交
259
      } else {
260 261
        PADDLE_THROW(platform::errors::Unavailable(
            "Context place dose not match the source and destination place."));
C
chengduo 已提交
262 263
      }
    }
264 265
  }
  else {  // NOLINT
266 267
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
Y
Yi Wang 已提交
268 269 270 271
  }
#endif
}

272
template <typename TENSOR>
273 274
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
275
                    TENSOR* dst) {
Y
Yi Wang 已提交
276 277
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
278
  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
279
      platform::is_custom_place(dst_place)) {
Y
Yi Wang 已提交
280
    dev_ctx = pool.Get(dst_place);
C
chengduo 已提交
281 282
  } else {
    dev_ctx = pool.Get(src.place());
Y
Yi Wang 已提交
283
  }
284 285 286
  TensorCopyImpl(src, dst_place, *dev_ctx, dst);
}

287
// Public entry point: copy `src` to `dst` on `dst_place`, letting the
// implementation choose an appropriate device context.
void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, dst);
}

// Public entry point: copy `src` to `dst` on `dst_place` using the
// caller-supplied device context `ctx` (its stream drives async copies).
void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                const platform::DeviceContext& ctx,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, ctx, dst);
}

// Synchronous tensor copy: the data is guaranteed to be on `dst` when this
// function returns. Device copies are issued on the null/default stream
// (nullptr) or explicitly waited on. Self-copy is handled via a temporary.
void TensorCopySync(const phi::DenseTensor& src,
                    const platform::Place& dst_place,
                    phi::DenseTensor* dst) {
  if (&src == dst) {
    // src and dst alias: copy through a temporary so dst->Resize cannot
    // invalidate the source buffer.
    auto src_copy = src;
    TensorCopySync(src_copy, dst_place, dst);
    return;
  }

  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
          << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
#ifdef PADDLE_WITH_MKLDNN
  if (src.layout() == DataLayout::ONEDNN) {
    dst->set_mem_desc(src.mem_desc());
  }
#endif
  auto src_place = src.place();
  auto src_ptr = src.data();
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copy the same data from " << src_place << " to "
            << dst_place;
    return;
  }

  auto size = src.numel() * phi::SizeOf(src.dtype());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {     /* custom_device -> cpu*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                // NOLINT
  else if (platform::is_cpu_place(src_place) &&    // NOLINT
           platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                 // NOLINT
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_custom_place(
               dst_place)) { /* custom_device -> custom_device*/
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
    platform::XPUPlace xpu_dst_place = dst_place;
    platform::XPUPlace xpu_src_place = src_place;
    // Same-device XPU copies are asynchronous: wait so the "Sync" contract
    // holds before returning.
    if (xpu_dst_place.device == xpu_src_place.device) {
      auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
      xpu_ctx->Wait();
    }
  }       // NOLINT
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    // nullptr stream => synchronous copy on the default stream.
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(
        dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}

void TensorToStream(std::ostream& os,
455
                    const phi::DenseTensor& tensor,
Y
Yi Wang 已提交
456 457 458 459 460 461 462 463 464
                    const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  }
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
    proto::VarType::TensorDesc desc;
465
    desc.set_data_type(framework::TransToProtoVarType(tensor.dtype()));
466
    auto dims = phi::vectorize(tensor.dims());
Y
Yi Wang 已提交
467 468 469 470 471 472 473 474 475
    auto* pb_dims = desc.mutable_dims();
    pb_dims->Resize(static_cast<int>(dims.size()), 0);
    std::copy(dims.begin(), dims.end(), pb_dims->begin());
    int32_t size = desc.ByteSize();
    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
    auto out = desc.SerializeAsString();
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
476
    uint64_t size = tensor.numel() * phi::SizeOf(tensor.dtype());
Y
yuyang18 已提交
477

478
    auto* data_ptr = tensor.data();
479 480
    PADDLE_ENFORCE_LT(size,
                      (std::numeric_limits<std::streamsize>::max)(),
T
tangwei12 已提交
481 482
                      platform::errors::ResourceExhausted(
                          "tensor size %d overflow when writing tensor", size));
Y
Yi Wang 已提交
483
    if (platform::is_gpu_place(tensor.place())) {
484
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Y
Yi Wang 已提交
485 486
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
L
Leo Chen 已提交
487
      auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
Y
Yi Wang 已提交
488 489 490 491
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
492 493 494 495 496
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
Y
Yi Wang 已提交
497 498 499 500 501 502 503
                     gpu_dev_ctx.stream());
        gpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
T
tangwei12 已提交
504 505
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
506 507 508 509 510 511 512 513 514 515 516
#endif
    } else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& xpu_dev_ctx =
          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
517 518 519 520 521
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write);
522 523 524 525 526 527 528 529
        xpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
530 531 532 533 534 535 536 537 538 539 540
#endif
    } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& custom_device_context =
          static_cast<const platform::CustomDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
541 542 543 544 545
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
546 547 548 549 550 551 552 553 554 555
                     custom_device_context.stream());
        custom_device_context.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported when not compiled with "
          "CustomDevice"));
Y
Yi Wang 已提交
556 557 558 559 560 561 562 563 564
#endif
    } else {
      os.write(static_cast<const char*>(data_ptr),
               static_cast<std::streamsize>(size));
    }
  }
}

struct DeserializedDataFunctor {
565
  DeserializedDataFunctor(void** buf,
566
                          phi::DenseTensor* tensor,
Y
Yi Wang 已提交
567 568 569 570
                          const platform::Place& place)
      : buf_(buf), tensor_(tensor), place_(place) {}

  template <typename T>
D
dzhwinter 已提交
571
  void apply() {
Y
Yi Wang 已提交
572 573 574 575
    *buf_ = tensor_->mutable_data<T>(place_);
  }

  void** buf_;
576
  phi::DenseTensor* tensor_;
Y
Yi Wang 已提交
577 578 579
  platform::Place place_;
};

580
void TensorFromStream(std::istream& is,
581
                      phi::DenseTensor* tensor,
T
tangwei12 已提交
582
                      const platform::DeviceContext& dev_ctx,
583 584
                      const size_t& seek,
                      const std::vector<int64_t>& shape) {
T
tangwei12 已提交
585 586 587 588
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));

  PADDLE_ENFORCE_EQ(
589 590
      version,
      0U,
T
tangwei12 已提交
591 592 593 594 595 596 597 598 599 600 601 602
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, Only version 0 is supported",
          version));

  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
    int32_t size;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
603 604
        desc.ParseFromArray(buf.get(), size),
        true,
T
tangwei12 已提交
605 606 607
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
608
    tensor->Resize(phi::make_ddim(shape));
T
tangwei12 已提交
609 610 611 612
    size_t seekg = seek * framework::SizeOfType(desc.data_type());
    is.seekg(seekg, is.cur);

    void* buf;
L
Leo Chen 已提交
613
    phi::CPUContext ctx;
T
tangwei12 已提交
614
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
615
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
616
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
617 618
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
619
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
W
Wang Xin 已提交
620
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
621
      phi::DenseTensor cpu_tensor;
622
      cpu_tensor.Resize(phi::make_ddim(shape));
T
tangwei12 已提交
623 624 625 626 627 628
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
629 630
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
631 632
        dev_ctx.Wait();
      }
T
tangwei12 已提交
633
#else
634 635 636
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
637
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
638 639 640
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      }
T
tangwei12 已提交
641 642 643 644 645 646 647 648 649 650
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

651
void TensorFromStream(std::istream& is,
652
                      phi::DenseTensor* tensor,
Y
Yi Wang 已提交
653 654 655
                      const platform::DeviceContext& dev_ctx) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
T
tangwei12 已提交
656
  PADDLE_ENFORCE_EQ(
657 658
      version,
      0U,
T
tangwei12 已提交
659 660 661
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, Only version 0 is supported",
          version));
Y
Yi Wang 已提交
662 663 664
  proto::VarType::TensorDesc desc;
  {  // int32_t size
     // proto buffer
Z
zlsh80826 已提交
665
    int32_t size = -1;
Y
Yi Wang 已提交
666
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
667
    PADDLE_ENFORCE_EQ(
668 669
        is.good(),
        true,
670
        platform::errors::Unavailable("Cannot read tensor desc size"));
671 672 673 674
    PADDLE_ENFORCE_GE(size,
                      0,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor desc size should >= 0"));
Y
Yi Wang 已提交
675 676
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
T
tangwei12 已提交
677
    PADDLE_ENFORCE_EQ(
678 679
        desc.ParseFromArray(buf.get(), size),
        true,
T
tangwei12 已提交
680
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
Y
Yi Wang 已提交
681 682 683 684 685
  }
  {  // read tensor
    std::vector<int64_t> dims;
    dims.reserve(static_cast<size_t>(desc.dims().size()));
    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
686
    tensor->Resize(phi::make_ddim(dims));
Y
Yi Wang 已提交
687
    void* buf;
L
Leo Chen 已提交
688
    phi::CPUContext ctx;
Y
Yu Yang 已提交
689
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
690
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
691
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
692 693
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
694
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
W
Wang Xin 已提交
695
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
696
      phi::DenseTensor cpu_tensor;
697
      cpu_tensor.Resize(phi::make_ddim(dims));
Y
Yi Wang 已提交
698 699 700
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
Y
yuyang18 已提交
701
      is.read(static_cast<char*>(buf), size);
Y
Yi Wang 已提交
702 703
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
704 705
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
706 707
        dev_ctx.Wait();
      }
Y
Yi Wang 已提交
708
#else
709 710 711
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
712
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
713 714
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
715
      } else if (platform::is_npu_place(dev_ctx.GetPlace())) {
716 717
        PADDLE_THROW(platform::errors::Unimplemented(
            "NPUPlace is not supported when not compiled with NPU"));
718 719 720
      } else {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CutomPlace is not supported when not compiled with CustomDevice"));
721
      }
Y
Yi Wang 已提交
722 723 724 725 726
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
Y
yuyang18 已提交
727
      is.read(static_cast<char*>(buf), size);
Y
Yi Wang 已提交
728 729 730 731
    }
  }
}

6
633WHU 已提交
732
// get tensor data point by DLDataType
733
void* GetDstPtrByDLDataType(DLDataType type,
734
                            phi::DenseTensor* dst,
6
633WHU 已提交
735 736
                            const platform::Place& dst_place) {
  // vector types not currently supported
737 738
  PADDLE_ENFORCE_LE(type.lanes,
                    1,
739 740
                    platform::errors::Unimplemented(
                        "Vector type is not supported currently."));
6
633WHU 已提交
741 742 743 744 745 746 747

  switch (type.bits) {
    case 8:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
      if (type.code == kDLUInt)
        return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
748 749
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
750 751
          type.code,
          type.bits));
6
633WHU 已提交
752 753 754 755 756 757
    case 16:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::float16>(dst_place));
S
Siming Dai 已提交
758 759 760
      if (type.code == kDLBfloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::bfloat16>(dst_place));
761 762
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
763 764
          type.code,
          type.bits));
6
633WHU 已提交
765 766 767 768 769
    case 32:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<float>(dst_place));
770 771
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
772 773
          type.code,
          type.bits));
6
633WHU 已提交
774 775 776 777 778
    case 64:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<double>(dst_place));
S
Siming Dai 已提交
779 780 781 782 783
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<float>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
784 785
          type.code,
          type.bits));
S
Siming Dai 已提交
786 787 788 789
    case 128:
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<double>>(dst_place));
790 791
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
792 793
          type.code,
          type.bits));
6
633WHU 已提交
794
    default:
795 796
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported DLDataType.bits %d.", type.bits));
6
633WHU 已提交
797 798 799
  }
}

800
void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) {
6
633WHU 已提交
801 802 803 804
  platform::CPUPlace dst_place = platform::CPUPlace();
  platform::CPUPlace src_place = platform::CPUPlace();

  std::vector<int64_t> vec;
805 806
  std::copy(dl_tensor.shape,
            dl_tensor.shape + dl_tensor.ndim,
6
633WHU 已提交
807 808
            std::back_inserter(vec));

809
  framework::DDim vddim = phi::make_ddim(vec);
6
633WHU 已提交
810 811 812 813 814 815

  dst->Resize(vddim);
  ::DLDataType type = dl_tensor.dtype;
  void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);

  auto src_ptr = static_cast<const void*>(dl_tensor.data);
816
  auto size = phi::product(vddim) * type.bits / 8;
6
633WHU 已提交
817

S
Siming Dai 已提交
818
  if (dl_tensor.device.device_type == kDLCPU) {
6
633WHU 已提交
819 820
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
821
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
S
Siming Dai 已提交
822
  if (dl_tensor.device.device_type == kDLGPU) {
6
633WHU 已提交
823
    platform::CUDAPlace dst_place =
S
Siming Dai 已提交
824
        platform::CUDAPlace(dl_tensor.device.device_id);
6
633WHU 已提交
825
    platform::CUDAPlace src_place =
S
Siming Dai 已提交
826
        platform::CUDAPlace(dl_tensor.device.device_id);
6
633WHU 已提交
827 828
    dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
L
Leo Chen 已提交
829 830 831 832 833 834
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
6
633WHU 已提交
835 836
  }
#endif
837 838 839
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
6
633WHU 已提交
840 841
}

S
Siming Dai 已提交
842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883
// Import a DLPack-managed tensor into `dst`, then hand the DLManagedTensor
// back to its producer.
//
// Unlike the borrowed ::DLTensor overload, this overload consumes `src`:
// after the payload has been copied, the producer's `deleter` is invoked so
// it can release its resources. The DLPack specification allows `deleter`
// to be NULL (nothing to release), so the call is guarded.
void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) {
  std::vector<int64_t> vec;
  std::copy(src->dl_tensor.shape,
            src->dl_tensor.shape + src->dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);
  dst->Resize(vddim);
  ::DLDataType type = src->dl_tensor.dtype;

  auto src_ptr = static_cast<const void*>(src->dl_tensor.data);
  // Element count times element width in bytes.
  auto size = phi::product(vddim) * type.bits / 8;

  if (src->dl_tensor.device.device_type == kDLCPU) {
    platform::CPUPlace dst_place = platform::CPUPlace();
    platform::CPUPlace src_place = platform::CPUPlace();
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (src->dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    // Fix copy by share allocation.
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
  // DLPack permits a NULL deleter; calling through it unconditionally (as
  // before) would dereference a null function pointer for such producers.
  if (src->deleter != nullptr) {
    src->deleter(const_cast<DLManagedTensor*>(src));
  }
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

884
template <typename T>
885
std::string format_tensor(const phi::DenseTensor& tensor) {
886 887 888 889
  // TODO(zhiqiu): use the print option to format tensor.
  return "NOT IMPLEMENTED";
}

890
template <typename T>
891
std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
892 893 894
  auto inspect = tensor.data<T>();
  auto element_num = tensor.numel();

895
  os << "  - data: [";
896 897 898 899 900 901 902 903 904 905 906 907 908 909
  // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly
  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
    if (element_num > 0) {
      os << signed(inspect[0]);
      for (int j = 1; j < element_num; ++j) {
        os << " " << signed(inspect[j]);
      }
    }
  } else {
    if (element_num > 0) {
      os << inspect[0];
      for (int j = 1; j < element_num; ++j) {
        os << " " << inspect[j];
      }
910 911 912 913 914 915
    }
  }
  os << "]";
  return os;
}

916
template <>
917
std::ostream& print_tensor<paddle::platform::complex<float>>(
918
    std::ostream& os, const phi::DenseTensor& tensor) {
919
  auto inspect = tensor.data<paddle::platform::complex<float>>();
920 921 922 923
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
924
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
925
    for (int j = 1; j < element_num; ++j) {
926 927
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
928 929 930 931 932 933 934
    }
  }
  os << "]";
  return os;
}

template <>
935
std::ostream& print_tensor<paddle::platform::complex<double>>(
936
    std::ostream& os, const phi::DenseTensor& tensor) {
937
  auto inspect = tensor.data<paddle::platform::complex<double>>();
938 939 940 941
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
942
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
943
    for (int j = 1; j < element_num; ++j) {
944 945
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
946 947 948 949 950 951
    }
  }
  os << "]";
  return os;
}

952
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
953 954
  // NOTE(xiongkun):
  // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution
955
  // if we don't redefine, the operator << of phi / framework LoD is not found.
956
  paddle::string::operator<<(os, lod);
957 958 959
  return os;
}

960 961 962
}  // namespace framework
}  // namespace paddle

963
namespace phi {
964

965 966 967 968 969
// Forwarding overload so phi::LoD streams identically to the framework
// variant; delegates to the printer defined in paddle::string.
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  paddle::string::operator<<(os, lod);
  return os;
}

970
// Pretty-print a DenseTensor: lod (when present), place, shape, layout,
// dtype and the flattened data. Non-CPU tensors are staged through a host
// copy so the data can be read from the CPU.
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
  if (t.lod().size() > 0) {
    os << "  - lod: " << t.lod() << "\n";
  }

  os << "  - place: " << t.place() << "\n";
  os << "  - shape: [" << t.dims() << "]\n";
  os << "  - layout: " << phi::DataLayoutToString(t.layout()) << "\n";

  DenseTensor cpu_tensor;
  cpu_tensor.Resize(t.dims());
  if (paddle::platform::is_cpu_place(t.place())) {
    // Already on host: alias the storage instead of copying it.
    cpu_tensor.ShareDataWith(t);
  } else {
    // Stage a host copy and wait for the device to finish before reading.
    paddle::platform::CPUPlace cpu_place;
    paddle::framework::TensorCopy(t, cpu_place, &cpu_tensor);
    auto& dev_ctx =
        *paddle::platform::DeviceContextPool::Instance().Get(t.place());
    dev_ctx.Wait();
  }

// Try each registered data type; on the first dtype match, print the dtype
// tag and the data, then return.
#define PrintTensorCallback(cpp_type, proto_type)                     \
  do {                                                                \
    if (paddle::framework::TransToProtoVarType(cpu_tensor.dtype()) == \
        proto_type) {                                                 \
      os << "  - dtype: " << proto_type << "\n";                      \
      paddle::framework::print_tensor<cpp_type>(os, cpu_tensor);      \
      return os;                                                      \
    }                                                                 \
  } while (0)

  _ForEachDataType_(PrintTensorCallback);
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
1006
}  // namespace phi