/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"

#include <algorithm>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/dense_tensor.h"

#ifdef PADDLE_WITH_MKLDNN
#include "dnnl_debug.h"  // NOLINT
#endif

namespace paddle {
namespace framework {

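// TensorCopyImpl dispatches on (src_place, dst_place) to the matching
// memory::Copy overload. Device copies are enqueued on the given context's
// stream and may return before the copy completes, so callers that need the
// result immediately must synchronize on that context (or use TensorCopySync
// below).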
template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    const platform::DeviceContext& ctx,
                    TENSOR* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopyImpl(src_copy, dst_place, ctx, dst);
    return;
  }

  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
          << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
  auto src_place = src.place();
  auto src_ptr = src.data();
#ifdef PADDLE_WITH_MKLDNN
  dst->set_mem_desc(src.mem_desc());
  // Due to padding, oneDNN tensors may be larger than numel() * size(type()).
  auto dst_ptr =
      src.layout() == DataLayout::ONEDNN
          ? dst->mutable_data(dst_place, src.dtype(), src.memory_size())
          : dst->mutable_data(dst_place, src.dtype());
#else
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
#endif
  dst->set_layout(src.layout());
  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data asynchronously from " << src_place
            << " to " << dst_place;
    return;
  }
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

#ifdef PADDLE_WITH_MKLDNN
  auto size = src.layout() == DataLayout::ONEDNN
                  ? src.memory_size()
                  : src.numel() * phi::SizeOf(src.dtype());
#else
  auto size = src.numel() * phi::SizeOf(src.dtype());
#endif

  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  } else if (platform::is_custom_place(src_place) &&  // NOLINT
             platform::is_custom_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
              << dst_place;
      return;
    }
    auto stream =
        reinterpret_cast<const platform::CustomDeviceContext&>(ctx).stream();
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_cpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else if (platform::is_xpu_place(src_place) &&
             platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data asynchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Source place and context place do not match, source "
                          "place is %s, context place is %s.",
                          src_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::Unavailable(
                          "Destination place and context place do not match, "
                          "destination place is %s, context place is %s.",
                          dst_gpu_place,
                          ctx_gpu_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cuda_pinned_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from GPU memory to CUDA Pinned memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(src_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The source GPU device and current device context do "
                          "not match. The source GPU device number is %d, but "
                          "device context GPU number is %d.",
                          src_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cuda_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Device context place mismatch. When copying phi::DenseTensor "
            "data from CUDA Pinned memory to GPU memory, current "
            "device context place should be GPU."));
    auto ctx_gpu_place = ctx_place;
    PADDLE_ENFORCE_EQ(dst_gpu_place,
                      ctx_gpu_place,
                      platform::errors::PreconditionNotMet(
                          "The target GPU device and current device context do "
                          "not match. The target GPU device number is %d, but "
                          "device context GPU number is %d.",
                          dst_gpu_place.device,
                          ctx_gpu_place.device));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    memory::Copy(
        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    auto ctx_place = ctx.GetPlace();
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx_place),
        true,
        platform::errors::PreconditionNotMet(
            "Context place error, expected GPUPlace, but actually %s.",
            ctx_place));
    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
      memory::Copy(
          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
    } else {
      if (platform::is_same_place(ctx_place, src_place)) {
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
      } else if (platform::is_same_place(ctx_place, dst_place)) {
        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
        memory::Copy(
            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
      } else {
        PADDLE_THROW(platform::errors::Unavailable(
            "Context place does not match the source and destination place."));
      }
    }
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_MLU
  else if (platform::is_mlu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_mlu_place = src_place;
    auto dst_cpu_place = dst_place;
    auto stream =
        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
    memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_mlu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_mlu_place = dst_place;
    auto stream =
        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
    memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
  }
  else if (platform::is_mlu_place(src_place) &&  // NOLINT
           platform::is_mlu_place(dst_place)) {
    auto src_mlu_place = src_place;
    auto dst_mlu_place = dst_place;
    auto stream =
        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
    memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data synchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copying from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}

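// Convenience overload: picks a device context from the global pool, using
// the context of dst_place when copying onto a device and the context of
// src.place() otherwise.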
template <typename TENSOR>
void TensorCopyImpl(const TENSOR& src,
                    const platform::Place& dst_place,
                    TENSOR* dst) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  const platform::DeviceContext* dev_ctx;
  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
      platform::is_mlu_place(dst_place) ||
      platform::is_custom_place(dst_place)) {
    dev_ctx = pool.Get(dst_place);
  } else {
    dev_ctx = pool.Get(src.place());
  }
  TensorCopyImpl(src, dst_place, *dev_ctx, dst);
}

void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, dst);
}
void TensorCopy(const phi::DenseTensor& src,
                const platform::Place& dst_place,
                const platform::DeviceContext& ctx,
                phi::DenseTensor* dst) {
  TensorCopyImpl<phi::DenseTensor>(src, dst_place, ctx, dst);
}

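// A minimal usage sketch contrasting the two entry points (assumes a CUDA
// build; `gpu_ctx` is a hypothetical phi::GPUContext fetched from the
// DeviceContextPool):
//
//   phi::DenseTensor src, dst;
//   src.Resize({2, 3});
//   src.mutable_data<float>(platform::CPUPlace());
//   TensorCopy(src, platform::CUDAPlace(0), gpu_ctx, &dst);  // asynchronous
//   gpu_ctx.Wait();                      // synchronize before reading dst
//   TensorCopySync(dst, platform::CPUPlace(), &src);         // blocking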
void TensorCopySync(const phi::DenseTensor& src,
                    const platform::Place& dst_place,
                    phi::DenseTensor* dst) {
  if (&src == dst) {
    auto src_copy = src;
    TensorCopySync(src_copy, dst_place, dst);
    return;
  }

  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
          << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
#ifdef PADDLE_WITH_MKLDNN
  if (src.layout() == DataLayout::ONEDNN) {
    dst->set_mem_desc(src.mem_desc());
  }
#endif
  auto src_place = src.place();
  auto src_ptr = src.data();
  auto dst_ptr = dst->mutable_data(dst_place, src.dtype());
  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

  if (src_ptr == dst_ptr && src_place == dst_place) {
    VLOG(3) << "Skip copying the same data from " << src_place << " to "
            << dst_place;
    return;
  }

  auto size = src.numel() * phi::SizeOf(src.dtype());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {     /* custom_device -> cpu*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                // NOLINT
  else if (platform::is_cpu_place(src_place) &&    // NOLINT
           platform::is_custom_place(dst_place)) { /* cpu -> custom_device*/
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }                                                 // NOLINT
  else if (platform::is_custom_place(src_place) &&  // NOLINT
           platform::is_custom_place(
               dst_place)) { /* custom_device -> custom_device*/
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data synchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
#endif
#ifdef PADDLE_WITH_XPU
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }                                              // NOLINT
  else if (platform::is_xpu_place(src_place) &&  // NOLINT
           platform::is_xpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data synchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
    platform::XPUPlace xpu_dst_place = dst_place;
    platform::XPUPlace xpu_src_place = src_place;
    if (xpu_dst_place.device == xpu_src_place.device) {
      auto xpu_ctx = platform::DeviceContextPool::Instance().Get(xpu_dst_place);
      xpu_ctx->Wait();
    }
  }       // NOLINT
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cuda_pinned_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_cpu_place = dst_place;
    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_cpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_gpu_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_gpu_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
           platform::is_gpu_place(dst_place)) {
    auto src_pinned_place = src_place;
    auto dst_gpu_place = dst_place;
    memory::Copy(
        dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_MLU
  else if (platform::is_mlu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_mlu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else if (platform::is_mlu_place(src_place) &&  // NOLINT
           platform::is_mlu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data synchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
#ifdef PADDLE_WITH_IPU
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_cpu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_cpu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else if (platform::is_ipu_place(src_place) &&  // NOLINT
           platform::is_ipu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
      VLOG(3) << "Skip copying the same data synchronously from " << src_place
              << " to " << dst_place;
      return;
    }
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
  else {  // NOLINT
    PADDLE_THROW(platform::errors::Unimplemented(
        "Copy from %s to %s is not supported.", src_place, dst_place));
  }
#endif
}

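// Serializes a tensor to a stream as three consecutive fields:
//   1. uint32_t version (currently always 0);
//   2. int32_t byte size of a proto::VarType::TensorDesc, followed by the
//      serialized message itself (data type and dims);
//   3. the raw element data, staged through a host buffer in chunks when the
//      tensor lives on a device.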
void TensorToStream(std::ostream& os,
                    const phi::DenseTensor& tensor,
                    const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version
    constexpr uint32_t version = 0;
    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  }
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
    proto::VarType::TensorDesc desc;
    desc.set_data_type(framework::TransToProtoVarType(tensor.dtype()));
    auto dims = phi::vectorize(tensor.dims());
    auto* pb_dims = desc.mutable_dims();
    pb_dims->Resize(static_cast<int>(dims.size()), 0);
    std::copy(dims.begin(), dims.end(), pb_dims->begin());
    int32_t size = desc.ByteSize();
    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
    auto out = desc.SerializeAsString();
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
    uint64_t size = tensor.numel() * phi::SizeOf(tensor.dtype());

    auto* data_ptr = tensor.data();
    PADDLE_ENFORCE_LT(size,
                      (std::numeric_limits<std::streamsize>::max)(),
                      platform::errors::ResourceExhausted(
                          "tensor size %d overflows when writing tensor", size));
    if (platform::is_gpu_place(tensor.place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
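      // Copy the payload to the host in kBufSize chunks, writing each chunk
      // to the stream before fetching the next one.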
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     gpu_dev_ctx.stream());
        gpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CUDAPlace is not supported when not compiled with CUDA"));
#endif
    } else if (platform::is_xpu_place(tensor.place())) {
#ifdef PADDLE_WITH_XPU
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& xpu_dev_ctx =
          static_cast<const platform::XPUDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write);
        xpu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "XPUPlace is not supported when not compiled with XPU"));
#endif
    } else if (platform::is_mlu_place(tensor.place())) {
#ifdef PADDLE_WITH_MLU
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& mlu_dev_ctx =
          static_cast<const platform::MLUDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     mlu_dev_ctx.stream());
        mlu_dev_ctx.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "MLUPlace is not supported when not compiled with MLU"));
#endif
    } else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
      std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& custom_device_context =
          static_cast<const platform::CustomDeviceContext&>(dev_ctx);
      platform::CPUPlace cpu;
      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
      while (size != 0) {
        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
        memory::Copy(cpu,
                     buf.get(),
                     tensor.place(),
                     reinterpret_cast<const void*>(data),
                     size_to_write,
                     custom_device_context.stream());
        custom_device_context.Wait();
        os.write(buf.get(), size_to_write);
        data += size_to_write;
        size -= size_to_write;
      }
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "CustomPlace is not supported when not compiled with "
          "CustomDevice"));
#endif
    } else {
      os.write(static_cast<const char*>(data_ptr),
               static_cast<std::streamsize>(size));
    }
  }
}

struct DeserializedDataFunctor {
  DeserializedDataFunctor(void** buf,
                          phi::DenseTensor* tensor,
                          const platform::Place& place)
      : buf_(buf), tensor_(tensor), place_(place) {}

  template <typename T>
  void apply() {
    *buf_ = tensor_->mutable_data<T>(place_);
  }

  void** buf_;
  phi::DenseTensor* tensor_;
  platform::Place place_;
};

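// Deserializes a tensor slice written by TensorToStream: checks the version,
// parses the TensorDesc, skips `seek` leading elements, then reads
// `shape`-many elements. DeserializedDataFunctor allocates the destination
// buffer with the element type recorded in the desc; device targets are
// staged through a CPU tensor.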
void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx,
                      const size_t& seek,
                      const std::vector<int64_t>& shape) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));

  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, only version 0 is supported",
          version));

  proto::VarType::TensorDesc desc;
  {  // int32_t size
    // proto buffer
    int32_t size;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    tensor->Resize(phi::make_ddim(shape));
    size_t seekg = seek * framework::SizeOfType(desc.data_type());
    is.seekg(seekg, is.cur);

    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) ||  \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(shape));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      } else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "MLUPlace is not supported when not compiled with MLU"));
      } else {
        PADDLE_THROW(platform::errors::Unimplemented(
            "NPUPlace is not supported when not compiled with NPU"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

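// Overload that reads the whole tensor, taking its dims from the serialized
// TensorDesc instead of a caller-provided shape.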
void TensorFromStream(std::istream& is,
                      phi::DenseTensor* tensor,
                      const platform::DeviceContext& dev_ctx) {
  uint32_t version;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  PADDLE_ENFORCE_EQ(
      version,
      0U,
      platform::errors::InvalidArgument(
          "tensor version %u is not supported, only version 0 is supported",
          version));
  proto::VarType::TensorDesc desc;
  {  // int32_t size
     // proto buffer
    int32_t size = -1;
    is.read(reinterpret_cast<char*>(&size), sizeof(size));
    PADDLE_ENFORCE_EQ(
        is.good(),
        true,
        platform::errors::Unavailable("Cannot read tensor desc size"));
    PADDLE_ENFORCE_GE(size,
                      0,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor desc size should be >= 0"));
    std::unique_ptr<char[]> buf(new char[size]);
    is.read(reinterpret_cast<char*>(buf.get()), size);
    PADDLE_ENFORCE_EQ(
        desc.ParseFromArray(buf.get(), size),
        true,
        platform::errors::InvalidArgument("Cannot parse tensor desc"));
  }
  {  // read tensor
    std::vector<int64_t> dims;
    dims.reserve(static_cast<size_t>(desc.dims().size()));
    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
    tensor->Resize(phi::make_ddim(dims));
    void* buf;
    phi::CPUContext ctx;
    size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
    if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
        platform::is_xpu_place(dev_ctx.GetPlace()) ||
        platform::is_mlu_place(dev_ctx.GetPlace()) ||
        platform::is_npu_place(dev_ctx.GetPlace()) ||
        platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) ||  \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
      phi::DenseTensor cpu_tensor;
      cpu_tensor.Resize(phi::make_ddim(dims));
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
      if (platform::is_npu_place(dev_ctx.GetPlace()) ||
          platform::is_custom_place(dev_ctx.GetPlace())) {
        dev_ctx.Wait();
      }
#else
      if (platform::is_gpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CUDAPlace is not supported when not compiled with CUDA"));
      } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "XPUPlace is not supported when not compiled with XPU"));
      } else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "MLUPlace is not supported when not compiled with MLU"));
      } else if (platform::is_npu_place(dev_ctx.GetPlace())) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "NPUPlace is not supported when not compiled with NPU"));
      } else {
        PADDLE_THROW(platform::errors::Unimplemented(
            "CustomPlace is not supported when not compiled with CustomDevice"));
      }
#endif
    } else {
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
      is.read(static_cast<char*>(buf), size);
    }
  }
}

// Allocate `dst` with the element type described by the DLDataType and
// return the raw data pointer.
void* GetDstPtrByDLDataType(DLDataType type,
                            phi::DenseTensor* dst,
                            const platform::Place& dst_place) {
  // vector types not currently supported
  PADDLE_ENFORCE_LE(type.lanes,
                    1,
                    platform::errors::Unimplemented(
                        "Vector type is not supported currently."));

  switch (type.bits) {
    case 8:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int8_t>(dst_place));
      if (type.code == kDLUInt)
        return static_cast<void*>(dst->mutable_data<uint8_t>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 16:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int16_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::float16>(dst_place));
      if (type.code == kDLBfloat)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::bfloat16>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 32:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int32_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<float>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 64:
      if (type.code == kDLInt)
        return static_cast<void*>(dst->mutable_data<int64_t>(dst_place));
      if (type.code == kDLFloat)
        return static_cast<void*>(dst->mutable_data<double>(dst_place));
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<float>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    case 128:
      if (type.code == kDLComplex)
        return static_cast<void*>(
            dst->mutable_data<paddle::platform::complex<double>>(dst_place));
      PADDLE_THROW(platform::errors::Unimplemented(
          "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.",
          type.code,
          type.bits));
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported DLDataType.bits %d.", type.bits));
  }
}

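// A minimal usage sketch (assumes `managed` is a DLManagedTensor* obtained
// from another framework via the DLPack protocol):
//
//   phi::DenseTensor dst;
//   paddle::framework::TensorFromDLPack(managed->dl_tensor, &dst);
//   // dst now owns a fresh copy; the DLPack buffer is not aliased.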
void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) {
  platform::CPUPlace dst_place = platform::CPUPlace();
  platform::CPUPlace src_place = platform::CPUPlace();

  std::vector<int64_t> vec;
  std::copy(dl_tensor.shape,
            dl_tensor.shape + dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);

  dst->Resize(vddim);
  ::DLDataType type = dl_tensor.dtype;
  void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);

  auto src_ptr = static_cast<const void*>(dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (dl_tensor.device.device_type == kDLCPU) {
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(dl_tensor.device.device_id);
    dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) {
  std::vector<int64_t> vec;
  std::copy(src->dl_tensor.shape,
            src->dl_tensor.shape + src->dl_tensor.ndim,
            std::back_inserter(vec));

  framework::DDim vddim = phi::make_ddim(vec);
  dst->Resize(vddim);
  ::DLDataType type = src->dl_tensor.dtype;

  auto src_ptr = static_cast<const void*>(src->dl_tensor.data);
  auto size = phi::product(vddim) * type.bits / 8;

  if (src->dl_tensor.device.device_type == kDLCPU) {
    platform::CPUPlace dst_place = platform::CPUPlace();
    platform::CPUPlace src_place = platform::CPUPlace();
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
  }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (src->dl_tensor.device.device_type == kDLGPU) {
    platform::CUDAPlace dst_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    platform::CUDAPlace src_place =
        platform::CUDAPlace(src->dl_tensor.device.device_id);
    void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
    // Fix copy by share allocation.
    memory::Copy(dst_place,
                 dst_ptr,
                 src_place,
                 src_ptr,
                 size,
                 reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
  }
#endif
  src->deleter(const_cast<DLManagedTensor*>(src));
#ifdef PADDLE_WITH_XPU
  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
#endif
}

template <typename T>
std::string format_tensor(const phi::DenseTensor& tensor) {
  // TODO(zhiqiu): use the print option to format tensor.
  return "NOT IMPLEMENTED";
}

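// Prints the elements of a (CPU-resident) tensor as a space-separated list;
// int8_t and uint8_t are promoted through signed() so they print as numbers
// rather than as characters.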
template <typename T>
std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<T>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  // Note: int8_t and uint8_t are typedefs of char; ostream would print them
  // as characters, so promote them via signed() to print numbers.
  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
    if (element_num > 0) {
      os << signed(inspect[0]);
      for (int j = 1; j < element_num; ++j) {
        os << " " << signed(inspect[j]);
      }
    }
  } else {
    if (element_num > 0) {
      os << inspect[0];
      for (int j = 1; j < element_num; ++j) {
        os << " " << inspect[j];
      }
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<float>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<float>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

template <>
std::ostream& print_tensor<paddle::platform::complex<double>>(
    std::ostream& os, const phi::DenseTensor& tensor) {
  auto inspect = tensor.data<paddle::platform::complex<double>>();
  auto element_num = tensor.numel();

  os << "  - data: [";
  if (element_num > 0) {
    os << signed(inspect[0].real) << "+" << signed(inspect[0].imag) << "j";
    for (int j = 1; j < element_num; ++j) {
      os << " " << signed(inspect[j].real) << "+" << signed(inspect[j].imag)
         << "j";
    }
  }
  os << "]";
  return os;
}

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  // NOTE(xiongkun):
  // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution
  // if we don't redefine, the operator << of phi / framework LoD is not found.
  paddle::string::operator<<(os, lod);
  return os;
}

}  // namespace framework
}  // namespace paddle

namespace phi {

std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  paddle::string::operator<<(os, lod);
  return os;
}

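// Illustrative output for a small CPU tensor (the exact place/dtype
// spellings depend on their respective operator<< implementations):
//   - place: Place(cpu)
//   - shape: [2, 2]
//   - layout: NCHW
//   - dtype: float
//   - data: [1 2 3 4]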
std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
  if (t.lod().size() > 0) {
    os << "  - lod: " << t.lod() << "\n";
  }

  os << "  - place: " << t.place() << "\n";
  os << "  - shape: [" << t.dims() << "]\n";
  os << "  - layout: " << phi::DataLayoutToString(t.layout()) << "\n";

  DenseTensor tensor;
  tensor.Resize(t.dims());
  if (paddle::platform::is_cpu_place(t.place())) {
    tensor.ShareDataWith(t);
  } else {
    paddle::platform::CPUPlace place;
    paddle::framework::TensorCopy(t, place, &tensor);
    paddle::platform::DeviceContextPool& pool =
        paddle::platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(t.place());
    dev_ctx.Wait();
  }

#define PrintTensorCallback(cpp_type, proto_type)                 \
  do {                                                            \
    if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \
        proto_type) {                                             \
      os << "  - dtype: " << proto_type << "\n";                  \
      paddle::framework::print_tensor<cpp_type>(os, tensor);      \
      return os;                                                  \
    }                                                             \
  } while (0)

  _ForEachDataType_(PrintTensorCallback);
  VLOG(1) << "PrintVar: unrecognized data type:" << t.type();
  return os;
}
}  // namespace phi