/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"

#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/dense_tensor.h"

#ifdef PADDLE_WITH_MKLDNN
#include "dnnl_debug.h"  // NOLINT
#endif

namespace paddle {
namespace framework {

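// TensorCopyImpl dispatches a device-aware copy of `src` into `dst` based on
// the (src_place, dst_place) pair. For GPU and custom-device paths the copy is
// issued on a stream taken from `ctx`, so it may still be in flight when this
// function returns; callers that need the data immediately should wait on the
// corresponding device context.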
template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    const platform::DeviceContext& ctx,
                    TENSOR* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopyImpl(src_copy, dst_place, ctx, dst);
    return;
  }

  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
  auto src_place = src.place();
  auto src_ptr = src.data();
#ifdef PADDLE_WITH_MKLDNN
  dst->set_mem_desc(src.mem_desc());
  // Due to padding, oneDNN tensors may be bigger than
  // numel() * size(type()).
  auto dst_ptr =
      src.layout() == DataLayout::ONEDNN
          ? dst->mutable_data(dst_place, src.dtype(), src.memory_size())
          : dst->mutable_data(dst_place, src.dtype());
#else
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
#endif
  dst->set_layout(src.layout());
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data async from " << src_place << " to "
            << dst_place;
    return;
  }
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

#ifdef PADDLE_WITH_MKLDNN
  auto size = src.layout() == DataLayout::ONEDNN
                  ? src.memory_size()
                  : src.numel() * phi::SizeOf(src.dtype());
#else
  auto size = src.numel() * phi::SizeOf(src.dtype());
#endif

  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_custom_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_xpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Source place and context place do not match, source "
                          "place is %s, context place is %s.",
                          src_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Destination place and context place do not match, "
                          "destination place is %s, context place is %s.",
                          dst_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cuda_pinned_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from GPU memory to CUDA Pinned memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The source GPU device and current device context do "
                          "not match. The source GPU device number is %d, but "
                          "device context GPU number is %d.",
                          src_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cuda_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from CUDA Pinned memory to GPU memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The target GPU device and current device context do "
                          "not match. The target GPU device number is %d, but "
                          "device context GPU number is %d.",
                          dst_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
      memory::Copy(
          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
    } else {
      if (platform::is_same_place(ctx_place, src_place)) {
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
      } else if (platform::is_same_place(ctx_place, dst_place)) {
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
      } else {
        PADDLE_THROW(platform::errors::Unavailable(
            "Context place does not match the source and destination place."));
      }
    }
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}

template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    TENSOR* dst) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
      platform::is_mlu_place(dst_place) ||
      platform::is_custom_place(dst_place)) {
    dev_ctx = pool.Get(dst_place);
  } else {
    dev_ctx = pool.Get(src.place());
  }
  TensorCopyImpl(src, dst_place, *dev_ctx, dst);
}

void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, dst);
}
void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                const platform::DeviceContext& ctx,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, ctx, dst);
}

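// TensorCopySync is the blocking variant of TensorCopy: device copies are
// issued with a null stream or followed by an explicit wait, so the data is
// ready when the call returns. A minimal usage sketch (hypothetical values):
//
//   phi::DenseTensor src, dst;
//   src.Resize({2, 3});
//   src.mutable_data<float>(platform::CPUPlace());
//   paddle::framework::TensorCopySync(src, platform::CPUPlace(), &dst);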
void TensorCopySync(const phi::DenseTensor& src,
                    const platform::Place& dst_place,
                    phi::DenseTensor* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopySync(src_copy, dst_place, dst);
    return;
  }

  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
          << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
#ifdef PADDLE_WITH_MKLDNN
  if (src.layout() == DataLayout::ONEDNN) {
    dst->set_mem_desc(src.mem_desc());
  }
#endif
  auto src_place = src.place();
  auto src_ptr = src.data();
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data from " << src_place << " to "
            << dst_place;
    return;
  }

  auto size = src.numel() * phi::SizeOf(src.dtype());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {     /* custom_device -> cpu*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                // NOLINT
  else if (platform::is_cpu_place(src_place) &&    // NOLINT
           platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                 // NOLINT
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_custom_place(
               dst_place)) { /* custom_device -> custom_device*/
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
    platform::XPUPlace xpu_dst_place = dst_place;
    platform::XPUPlace xpu_src_place = src_place;
    if (xpu_dst_place.device == xpu_src_place.device) {
      auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
      xpu_ctx->Wait();
    }
  }       // NOLINT
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(
        dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data sync from " << src_place << " to "
              << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}

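// TensorToStream serializes a tensor as three consecutive fields:
//   1. uint32_t version (currently always 0),
//   2. an int32_t byte count followed by a serialized
//      proto::VarType::TensorDesc (dtype and dims),
//   3. the raw tensor data, staged through a 64MB host buffer when the
//      tensor lives on a non-CPU device.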
void TensorToStream(std::ostream& os,
                    const phi::DenseTensor& tensor,
                    const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  }
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
    proto::VarType::TensorDesc desc;
    desc.set_data_type(framework::TransToProtoVarType(tensor.dtype()));
    auto dims = phi::vectorize(tensor.dims());
    auto* pb_dims = desc.mutable_dims();
    pb_dims->Resize(static_cast<int>(dims.size()), 0);
    std::copy(dims.begin(), dims.end(), pb_dims->begin());
    int32_t size = desc.ByteSize();
    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
    auto out = desc.SerializeAsString();
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
    uint64_t size = tensor.numel() * phi::SizeOf(tensor.dtype());

    auto* data_ptr = tensor.data();
    PADDLE_ENFORCE_LT(size,
                      (std::numeric_limits<std::streamsize>::max)(),
                      platform::errors::ResourceExhausted(
                          "tensor size %d overflows when writing tensor", size));
    if (platform::is_gpu_place(tensor.place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     gpu_dev_ctx.stream());
        gpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
#endif
    } else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& xpu_dev_ctx =
          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write);
        xpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
#endif
    } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& custom_device_context =
          static_cast<const platform::CustomDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     custom_device_context.stream());
        custom_device_context.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported when not compiled with "
          "CustomDevice"));
#endif
    } else {
      os.write(static_cast<const char*>(data_ptr),
               static_cast<std::streamsize>(size));
    }
  }
}

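// Visitor used with framework::VisitDataType to allocate `tensor_` with the
// statically-typed mutable_data<T> matching a runtime proto data type, and to
// expose the resulting buffer through `buf_`.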
struct DeserializedDataFunctor {
  DeserializedDataFunctor(void** buf,
                          phi::DenseTensor* tensor,
                          const platform::Place& place)
      : buf_(buf), tensor_(tensor), place_(place) {}

  template <typename T>
  void apply() {
    *buf_ = tensor_->mutable_data<T>(place_);
  }

  void** buf_;
  phi::DenseTensor* tensor_;
  platform::Place place_;
};

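// This overload deserializes a slice of a saved tensor: `seek` is the element
// offset to start reading from and `shape` is the shape of the slice to load.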
void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx,
                      const size_t& seek,
                      const std::vector<int64_t>& shape) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));

  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, only version 0 is supported",
          version));

  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
    int32_t size;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    tensor->Resize(phi::make_ddim(shape));
    size_t seekg = seek * framework::SizeOfType(desc.data_type());
    is.seekg(seekg, is.cur);

    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(shape));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

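// This overload reads the full tensor, taking its shape from the serialized
// TensorDesc written by TensorToStream.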
void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, only version 0 is supported",
          version));
  proto::VarType::TensorDesc desc;
  {  // int32_t size
     // proto buffer
    int32_t size = -1;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    PADDLE_ENFORCE_EQ(
        is.good(),
        true,
        platform::errors::Unavailable("Cannot read tensor desc size"));
    PADDLE_ENFORCE_GE(size,
                      0,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor desc size should be >= 0"));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    std::vector<int64_t> dims;
    dims.reserve(static_cast<size_t>(desc.dims().size()));
    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
    tensor->Resize(phi::make_ddim(dims));
    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(dims));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      } else if (platform::is_npu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "NPUPlace is not supported when not compiled with NPU"));
      } else {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CustomPlace is not supported when not compiled with CustomDevice"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

// get tensor data pointer by DLDataType
void* GetDstPtrByDLDataType(DLDataType type,
                            phi::DenseTensor* dst,
                            const platform::Place& dst_place) {
  // vector types not currently supported
  PADDLE_ENFORCE_LE(type.lanes,
                    1,
                    platform::errors::Unimplemented(
                        "Vector type is not supported currently."));

  switch (type.bits) {
    case 8:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
      if (type.code == kDLUInt)
        return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 16:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::float16>(dst_place));
      if (type.code == kDLBfloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::bfloat16>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 32:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<float>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 64:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<double>(dst_place));
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<float>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 128:
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<double>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported DLDataType.bits %d.", type.bits));
  }
}

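// TensorFromDLPack copies the contents of a DLPack tensor into `dst`,
// selecting the destination data type from the DLDataType (bits, code) pair.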
void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) {
  platform::CPUPlace dst_place = platform::CPUPlace();
  platform::CPUPlace src_place = platform::CPUPlace();

  std::vector<int64_t> vec;
  std::copy(dl_tensor.shape,
            dl_tensor.shape + dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);

  dst->Resize(vddim);
  ::DLDataType type = dl_tensor.dtype;
  void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);

  auto src_ptr = static_cast<const void*>(dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (dl_tensor.device.device_type == kDLCPU) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

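// This overload additionally takes ownership of the DLManagedTensor and
// releases it through its deleter once the copy has been issued.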
void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) {
  std::vector<int64_t> vec;
  std::copy(src->dl_tensor.shape,
            src->dl_tensor.shape + src->dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);
  dst->Resize(vddim);
  ::DLDataType type = src->dl_tensor.dtype;

  auto src_ptr = static_cast<const void*>(src->dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (src->dl_tensor.device.device_type == kDLCPU) {
    platform::CPUPlace dst_place = platform::CPUPlace();
    platform::CPUPlace src_place = platform::CPUPlace();
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (src->dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    // Fix copy by share allocation.
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
  src->deleter(const_cast<DLManagedTensor*>(src));
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

template <typename T>
std::string format_tensor(const phi::DenseTensor& tensor) {
  // TODO(zhiqiu): use the print option to format tensor.
  return "NOT IMPLEMENTED";
}

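// print_tensor streams the elements of a CPU tensor as "  - data: [x y z]".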
template <typename T>
std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<T>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  // Note: int8_t && uint8_t are typedefs of char; ostream cannot print them
  // properly as numbers, so cast to signed first.
  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
    if (element_num > 0) {
      os << signed(inspect[0]);
      for (int j = 1; j < element_num; ++j) {
        os << " " << signed(inspect[j]);
      }
    }
  } else {
    if (element_num > 0) {
      os << inspect[0];
      for (int j = 1; j < element_num; ++j) {
        os << " " << inspect[j];
      }
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<float>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<float>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<double>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<double>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  // NOTE(xiongkun):
  // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution
  // if we don't redefine, the operator << of phi / framework LoD is not found.
  paddle::string::operator<<(os, lod);
  return os;
}

}  // namespace framework
}  // namespace paddle

namespace phi {

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  paddle::string::operator<<(os, lod);
  return os;
}

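// Pretty-prints a DenseTensor: lod, place, shape, layout, dtype and data.
// Non-CPU tensors are first copied to CPU (with a wait) so the data can be
// inspected safely.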
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
  if (t.lod().size() > 0) {
    os << "  - lod: " << t.lod() << "\n";
  }

  os << "  - place: " << t.place() << "\n";
  os << "  - shape: [" << t.dims() << "]\n";
  os << "  - layout: " << phi::DataLayoutToString(t.layout()) << "\n";

  DenseTensor tensor;
  tensor.Resize(t.dims());
  if (paddle::platform::is_cpu_place(t.place())) {
    tensor.ShareDataWith(t);
  } else {
    paddle::platform::CPUPlace place;
    paddle::framework::TensorCopy(t, place, &tensor);
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
  }

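// PrintTensorCallback is expanded once per (cpp_type, proto_type) pair by
// _ForEachDataType_; the first proto type matching the tensor's dtype prints
// the data and returns.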
#define PrintTensorCallback(cpp_type, proto_type)                 \
  do {                                                            \
    if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \
        proto_type) {                                             \
      os << "  - dtype: " << proto_type << "\n";                  \
      paddle::framework::print_tensor<cpp_type>(os, tensor);      \
      return os;                                                  \
    }                                                             \
  } while (0)

  _ForEachDataType_(PrintTensorCallback);
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
}  // namespace phi