// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/allocator.h"
#ifdef PADDLE_WITH_ONNXRUNTIME
#include "onnxruntime_c_api.h"    // NOLINT
#include "onnxruntime_cxx_api.h"  // NOLINT
#endif

namespace paddle_infer {

using float16 = paddle::platform::float16;

void Tensor::Reshape(const std::vector<int> &shape) {
#ifdef PADDLE_WITH_ONNXRUNTIME
  if (is_ort_tensor_) {
    shape_.assign(shape.begin(), shape.end());
    return;
  }
#endif

  PADDLE_ENFORCE_EQ(
      name_.empty(),
      false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  PADDLE_ENFORCE_EQ(input_or_output_,
                    true,
                    paddle::platform::errors::PermissionDenied(
                        "Can't reshape the output tensor, it is readonly"));
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var,
      paddle::platform::errors::PreconditionNotMet(
          "No tensor called [%s] in the runtime scope", name_));
  auto *tensor = var->GetMutable<phi::DenseTensor>();
  tensor->Resize(phi::make_ddim(shape));
}

void Tensor::ReshapeStrings(const size_t &shape) {
  PADDLE_ENFORCE_EQ(
      name_.empty(),
      false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  PADDLE_ENFORCE_EQ(input_or_output_,
                    true,
                    paddle::platform::errors::PermissionDenied(
                        "Can't reshape the output tensor, it is readonly"));
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var,
      paddle::platform::errors::PreconditionNotMet(
          "No tensor called [%s] in the runtime scope", name_));
  paddle::framework::Strings *tensor =
      var->GetMutable<paddle::framework::Strings>();
  tensor->resize(shape);
}

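// Lazily resolves the underlying framework tensor: the first call looks the
// variable up in the runtime scope via FindTensor() and caches the raw
// pointer in tensor_, so subsequent calls skip the scope lookup.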
#define EAGER_GET_TENSOR(tensor_type)    \
  if (!tensor_) {                        \
    tensor_ = FindTensor<tensor_type>(); \
  }                                      \
  auto *tensor = static_cast<tensor_type *>(tensor_);

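// Allocates (or reuses) memory for this tensor on the requested place and
// returns a typed pointer to it. Reshape() must be called first so that
// numel() is positive.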
template <typename T>
T *Tensor::mutable_data(PlaceType place) {
#ifdef PADDLE_WITH_ONNXRUNTIME
  if (is_ort_tensor_) {
    return ORTGetMutableData<T>();
  }
#endif
  EAGER_GET_TENSOR(phi::DenseTensor);
  PADDLE_ENFORCE_GT(
      tensor->numel(),
      0,
      paddle::platform::errors::PreconditionNotMet(
          "You should call Tensor::Reshape(const std::vector<int> "
          "&shape)"
          "function before retrieving mutable_data from input tensor."));
  switch (static_cast<int>(place)) {
    case static_cast<int>(PlaceType::kCPU): {
      return tensor->mutable_data<T>(paddle::platform::CPUPlace());
    }
    case static_cast<int>(PlaceType::kGPU): {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      paddle::platform::CUDAPlace gpu_place(device_);
      auto *dev_ctxs = reinterpret_cast<const std::map<
          phi::Place,
          std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
          device_contexs_);
      auto *dev_ctx =
          static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
      return dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));
#else
      return tensor->mutable_data<T>(paddle::platform::CUDAPlace(device_));
#endif
    }
    case static_cast<int>(PlaceType::kXPU): {
      return tensor->mutable_data<T>(paddle::platform::XPUPlace(device_));
    }
    case static_cast<int>(PlaceType::kNPU): {
      return tensor->mutable_data<T>(paddle::platform::NPUPlace(device_));
    }
    case static_cast<int>(PlaceType::kCUSTOM): {
      return tensor->mutable_data<T>(
          paddle::platform::CustomPlace(device_type_, device_));
    }
    default:
      PADDLE_THROW(paddle::platform::errors::Unavailable(
          "Only CPU / CUDA / XPU / NPU places is supported. The place `%d` is "
          "not supported.",
          static_cast<int>(place)));
      break;
  }
  return nullptr;
}

template <typename T>
T *Tensor::data(PlaceType *place, int *size) const {
  EAGER_GET_TENSOR(phi::DenseTensor);
  auto *res = tensor->data<T>();

  if (paddle::platform::is_cpu_place(tensor->place())) {
    *place = PlaceType::kCPU;
  } else if (paddle::platform::is_gpu_place(tensor->place())) {
    *place = PlaceType::kGPU;
  } else if (paddle::platform::is_xpu_place(tensor->place())) {
    *place = PlaceType::kXPU;
  } else if (paddle::platform::is_npu_place(tensor->place())) {
    *place = PlaceType::kNPU;
  } else if (paddle::platform::is_custom_place(tensor->place())) {
    *place = PlaceType::kCUSTOM;
  } else {
    *place = PlaceType::kUNK;
  }

  *size = tensor->numel();
  return res;
}

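// Returns the element type of the underlying tensor; proto types without a
// direct mapping fall back to FLOAT32.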
DataType Tensor::type() const {
#ifdef PADDLE_WITH_ONNXRUNTIME
  if (is_ort_tensor_) {
    return dtype_;
  }
#endif
  EAGER_GET_TENSOR(phi::DenseTensor);
  auto type = paddle::framework::TransToProtoVarType(tensor->dtype());
  if (type == paddle::framework::proto::VarType::FP64) {
    return DataType::FLOAT64;
  } else if (type == paddle::framework::proto::VarType::FP32) {
    return DataType::FLOAT32;
  } else if (type == paddle::framework::proto::VarType::FP16) {
    return DataType::FLOAT16;
  } else if (type == paddle::framework::proto::VarType::INT64) {
    return DataType::INT64;
  } else if (type == paddle::framework::proto::VarType::INT32) {
    return DataType::INT32;
  } else if (type == paddle::framework::proto::VarType::UINT8) {
    return DataType::UINT8;
  } else if (type == paddle::framework::proto::VarType::INT8) {
    return DataType::INT8;
  } else if (type == paddle::framework::proto::VarType::BOOL) {
    return DataType::BOOL;
  }
  return DataType::FLOAT32;
}

PlaceType Tensor::place() const { return place_; }

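// Copies host data into the tensor, allocating storage on the tensor's
// current place (CPU, GPU, XPU, NPU or a custom device); on asynchronous
// devices the copy is issued on that device's stream. A minimal sketch of the
// usual feeding flow, assuming a paddle_infer::Predictor `predictor` created
// elsewhere (not part of this file):
//
//   auto input = predictor->GetInputHandle("x");
//   input->Reshape({1, 3, 224, 224});
//   std::vector<float> host(1 * 3 * 224 * 224, 0.f);
//   input->CopyFromCpu(host.data());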
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
  EAGER_GET_TENSOR(phi::DenseTensor);
  PADDLE_ENFORCE_GE(tensor->numel(),
                    0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::Reshape(const "
                        "std::vector<int> &shape)"
                        "function before copying data from cpu."));
  size_t ele_size = tensor->numel() * sizeof(T);

  if (place_ == PlaceType::kCPU) {
    auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else if (place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

    paddle::platform::CUDAPlace gpu_place(device_);
    auto *dev_ctxs = reinterpret_cast<const std::map<
        phi::Place,
        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
        device_contexs_);
    auto *dev_ctx =
        static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
    auto *t_data = dev_ctx->Alloc<T>(tensor, tensor->numel() * sizeof(T));

    paddle::memory::Copy(gpu_place,
                         static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(),
                         data,
                         ele_size,
                         dev_ctx->stream());
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU
    paddle::platform::XPUPlace xpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(xpu_place);
    paddle::memory::Copy(xpu_place,
                         static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(),
                         data,
                         ele_size);
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with XPU place because paddle is not compiled "
        "with XPU."));
#endif
  } else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::NPUPlace npu_place(device_);
    auto *t_data = tensor->mutable_data<T>(npu_place);
    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
        pool.Get(npu_place));
    paddle::memory::Copy(npu_place,
                         static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(),
                         data,
                         ele_size,
                         dev_ctx->stream());
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with NPU place because paddle is not compiled "
        "with NPU."));
#endif
  } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_type_id =
        static_cast<size_t>(place_) - static_cast<size_t>(PlaceType::kCUSTOM);
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CustomPlace custom_place(
        phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType(
            device_type_id),
        device_);
    auto *t_data = tensor->mutable_data<T>(custom_place);
    auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
        pool.Get(custom_place));
    paddle::memory::Copy(custom_place,
                         static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(),
                         data,
                         ele_size,
                         dev_ctx->stream());
#else
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
#endif
  }
}

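// Maps C++ element types to the matching phi::DataType; used below to build
// the DenseTensorMeta for ShareExternalData.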
template <typename T>
struct DataTypeInfo;

template <>
struct DataTypeInfo<double> {
  phi::DataType TYPE = phi::DataType::FLOAT64;
};

template <>
struct DataTypeInfo<float> {
  phi::DataType TYPE = phi::DataType::FLOAT32;
};

template <>
struct DataTypeInfo<float16> {
  phi::DataType TYPE = phi::DataType::FLOAT16;
};

template <>
struct DataTypeInfo<int64_t> {
  phi::DataType TYPE = phi::DataType::INT64;
};

template <>
struct DataTypeInfo<int8_t> {
  phi::DataType TYPE = phi::DataType::INT8;
};

template <>
struct DataTypeInfo<uint8_t> {
  phi::DataType TYPE = phi::DataType::UINT8;
};

template <>
struct DataTypeInfo<int32_t> {
  phi::DataType TYPE = phi::DataType::INT32;
};

template <>
struct DataTypeInfo<bool> {
  phi::DataType TYPE = phi::DataType::BOOL;
};

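// Only the NCHW external layout is accepted for shared external data at the
// moment.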
phi::DataLayout LayoutConvert(DataLayout layout) {
  PADDLE_ENFORCE_EQ(
      layout,
      DataLayout::kNCHW,
      paddle::platform::errors::InvalidArgument("Only NCHW is supported now."));
  return phi::DataLayout::NCHW;
}

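// Wraps a caller-owned buffer as this tensor's storage without copying; the
// buffer has to stay valid for as long as the predictor may read the tensor.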
template <typename T>
void Tensor::ShareExternalData(const T *data,
                               const std::vector<int> &shape,
                               PlaceType place,
                               DataLayout layout) {
  EAGER_GET_TENSOR(phi::DenseTensor)
  size_t size =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
      sizeof(T);
  phi::DenseTensorMeta meta(
      DataTypeInfo<T>().TYPE, phi::make_ddim(shape), LayoutConvert(layout));
  if (place == PlaceType::kCPU) {
    phi::DenseTensor dtensor(
        std::make_shared<phi::Allocation>(
            const_cast<T *>(data), size, paddle::platform::CPUPlace()),
        meta);
    *tensor = std::move(dtensor);
  } else if (place == PlaceType::kGPU) {
    phi::DenseTensor dtensor(
        std::make_shared<phi::Allocation>(
            const_cast<T *>(data), size, paddle::platform::CUDAPlace(device_)),
        meta);
    *tensor = std::move(dtensor);
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "PlaceType must be PlaceType::kCPU or PlaceType::kGPU."));
  }
}

void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) {
  EAGER_GET_TENSOR(paddle::framework::Strings);
  PADDLE_ENFORCE_GE(tensor->size(),
                    0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::Reshape(const "
                        "std::size_t &shape)function before copying"
                        "the string data from cpu."));
  *tensor = *data;
}

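// Copies tensor data back into a host buffer. On CUDA the behaviour depends
// on the extra arguments: with exec_stream the copy stays asynchronous and
// the stream is handed back to the caller, with cb a host callback is
// enqueued after the copy, and with neither the call blocks until the copy
// completes.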
template <typename T>
void Tensor::CopyToCpuImpl(T *data,
                           void *exec_stream,
                           CallbackFunc cb,
                           void *cb_params) const {
  EAGER_GET_TENSOR(phi::DenseTensor);
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  if (paddle::platform::is_cpu_place(t_place)) {
#ifdef PADDLE_WITH_MKLDNN
    if (tensor->layout() == phi::DataLayout::ONEDNN) {
      phi::DenseTensor out;
      auto mem_allocation =
          std::make_shared<paddle::memory::allocation::Allocation>(
              static_cast<void *>(data),
              ele_num * sizeof(T),
              paddle::platform::CPUPlace());
      out.ResetHolder(mem_allocation);
      phi::funcs::TransDataLayoutFromOneDNN(
          tensor->layout(),
          phi::OneDNNContext::tls().get_cur_paddle_data_layout(),
          *tensor,
          &out,
          paddle::platform::CPUPlace(),
          true);
    } else {
      std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
    }
#else
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#endif
  } else if (paddle::platform::is_ipu_place(t_place)) {
#ifdef PADDLE_WITH_IPU
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with IPU place because paddle is not compiled "
        "with IPU."));
#endif
  } else if (place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    auto gpu_place = t_place;
    auto *dev_ctxs = reinterpret_cast<const std::map<
        phi::Place,
        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
        device_contexs_);
    auto *dev_ctx =
        static_cast<phi::GPUContext *>(dev_ctxs->at(gpu_place).get().get());
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data),
                         gpu_place,
                         t_data,
                         ele_num * sizeof(T),
                         dev_ctx->stream());
#ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(dev_ctx->stream());
#else
    // async, return stream
    if (nullptr != exec_stream) {
      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
      // async with callback
    } else if (cb) {
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
      // sync
    } else {
      cudaStreamSynchronize(dev_ctx->stream());
    }
#endif
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU
    auto xpu_place = t_place;
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data),
                         xpu_place,
                         t_data,
                         ele_num * sizeof(T));
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with XPU place because paddle is not compiled "
        "with XPU."));
#endif
  } else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    auto npu_place = t_place;
    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
        pool.Get(npu_place));
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data),
                         npu_place,
                         t_data,
                         ele_num * sizeof(T),
                         dev_ctx->stream());
    paddle::platform::NPUStreamSync(dev_ctx->stream());
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with NPU place because paddle is not compiled "
        "with NPU."));
#endif
  } else {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    auto custom_place = t_place;
    auto *dev_ctx = static_cast<const paddle::platform::CustomDeviceContext *>(
        pool.Get(custom_place));
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data),
                         custom_place,
                         t_data,
                         ele_num * sizeof(T),
                         dev_ctx->stream());
// TODO(wangran16): sync_stream
#else
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
#endif
  }
}

template <typename T>
void Tensor::CopyToCpu(T *data) const {
#ifdef PADDLE_WITH_ONNXRUNTIME
  if (is_ort_tensor_) {
    ORTCopyToCpu<T>(data);
    return;
  }
#endif

  CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
  CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
  CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
}

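// A minimal sketch of fetching an output with the CopyToCpu family above,
// assuming a paddle_infer::Predictor `predictor` that has already run
// (not part of this file):
//
//   auto output = predictor->GetOutputHandle("out");
//   auto out_shape = output->shape();
//   int64_t numel = std::accumulate(out_shape.begin(), out_shape.end(), 1LL,
//                                   std::multiplies<int64_t>());
//   std::vector<float> result(numel);
//   output->CopyToCpu(result.data());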
template PD_INFER_DECL void Tensor::CopyFromCpu<double>(const double *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<bool>(const bool *data);

template PD_INFER_DECL void Tensor::ShareExternalData<double>(
    const double *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<float>(
    const float *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int64_t>(
    const int64_t *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int32_t>(
    const int32_t *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<uint8_t>(
    const uint8_t *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<int8_t>(
    const int8_t *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<float16>(
    const float16 *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);
template PD_INFER_DECL void Tensor::ShareExternalData<bool>(
    const bool *data,
    const std::vector<int> &shape,
    PlaceType place,
    DataLayout layout);

template PD_INFER_DECL void Tensor::CopyToCpu<double>(double *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<bool>(bool *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuImpl<double>(
    double *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float>(float *data,
                                                         void *exec_stream,
                                                         CallbackFunc cb,
                                                         void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int64_t>(
    int64_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int32_t>(
    int32_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<uint8_t>(
    uint8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
    int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
    float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<bool>(bool *data,
                                                        void *exec_stream,
                                                        CallbackFunc cb,
                                                        void *cb_params) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<double>(
    double *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
    float *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
    int64_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
    int32_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
    uint8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
    int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
    float16 *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(
    bool *data, void *exec_stream) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<double>(
    double *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
    float *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
    int64_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
    int32_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
    uint8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
    int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
    float16 *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<bool>(bool *data,
                                                         CallbackFunc cb,
                                                         void *cb_params) const;

template PD_INFER_DECL double *Tensor::data<double>(PlaceType *place,
                                                    int *size) const;
template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
                                                  int *size) const;
template PD_INFER_DECL int64_t *Tensor::data<int64_t>(PlaceType *place,
                                                      int *size) const;
template PD_INFER_DECL int32_t *Tensor::data<int32_t>(PlaceType *place,
                                                      int *size) const;
template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
                                                      int *size) const;
template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
                                                    int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
                                                      int *size) const;
template PD_INFER_DECL bool *Tensor::data<bool>(PlaceType *place,
                                                int *size) const;

template PD_INFER_DECL double *Tensor::mutable_data<double>(PlaceType place);
template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);
template PD_INFER_DECL bool *Tensor::mutable_data<bool>(PlaceType place);

Tensor::Tensor(void *scope, const void *device_contexts)
    : scope_{scope}, device_contexs_(device_contexts) {}

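// Looks the variable up in the runtime scope by name and returns a raw
// pointer to its payload (phi::DenseTensor or paddle::framework::Strings).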
template <typename T>
void *Tensor::FindTensor() const {
  PADDLE_ENFORCE_EQ(
      name_.empty(),
      false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var,
      paddle::platform::errors::PreconditionNotMet(
          "No tensor called [%s] in the runtime scope", name_));
  auto *tensor = var->GetMutable<T>();
  return tensor;
}

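// Returns the tensor's dimensions. For oneDNN tensors the dims are reported
// in the paddle-side layout, rotating the channel dimension when the current
// layout is NHWC/NDHWC.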
std::vector<int> Tensor::shape() const {
#ifdef PADDLE_WITH_ONNXRUNTIME
  if (is_ort_tensor_) {
    std::vector<int> shape;
    // input handle
    if (idx_ < 0) {
      shape.assign(shape_.begin(), shape_.end());
    } else {  // output handle
      auto binding = binding_.lock();
      PADDLE_ENFORCE_NOT_NULL(binding,
                              paddle::platform::errors::PreconditionNotMet(
                                  "output tensor [%s] no binding ptr", name_));
      std::vector<Ort::Value> outputs = binding->GetOutputValues();
      Ort::Value &value = outputs[idx_];
      auto info = value.GetTensorTypeAndShapeInfo();
      auto ort_shape = info.GetShape();
      shape.assign(ort_shape.begin(), ort_shape.end());
    }
    return shape;
  }
#endif
  EAGER_GET_TENSOR(phi::DenseTensor);
  PADDLE_ENFORCE_NOT_NULL(
      tensor_,
      paddle::platform::errors::PreconditionNotMet(
          "Not found tensor called %s in the scope", name_));
// oneDNN may do layout transforms internally, so we need to reorder before
// return
#ifdef PADDLE_WITH_MKLDNN
  if (tensor->layout() == phi::DataLayout::ONEDNN) {
    phi::DataLayout out_layout =
        phi::OneDNNContext::tls().get_cur_paddle_data_layout();
    // Set default as NCHW in case not specified
    out_layout = out_layout == phi::DataLayout::kAnyLayout
                     ? phi::DataLayout::kNCHW
                     : out_layout;
    // In these data layouts the channel dimension is either in the 2nd
    // position (nChw) or the last one (nhwC), so for dim==2 these layouts are
    // the same and nothing needs to be done. Similarly for dim==1, where only
    // one combination is possible.
    if (tensor->dims().size() < 3) return phi::vectorize<int>(tensor->dims());
    if (out_layout == phi::DataLayout::kNHWC ||
        out_layout == phi::DataLayout::kNDHWC) {
      auto dims = phi::vectorize<int>(tensor->dims());
      std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
      return dims;
    } else {
      return phi::vectorize<int>(tensor->dims());
    }
  }
#endif
  return phi::vectorize<int>(tensor->dims());
}

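// LoD (level-of-details) accessors: copy the nested offset vectors between
// the public API representation and paddle::framework::LoD.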
void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
  EAGER_GET_TENSOR(phi::DenseTensor);
  paddle::framework::LoD lod;
  for (auto &level : x) {
    lod.emplace_back(level);
  }
  tensor->set_lod(lod);
}

std::vector<std::vector<size_t>> Tensor::lod() const {
  EAGER_GET_TENSOR(phi::DenseTensor);
  std::vector<std::vector<size_t>> res;
  for (auto &level : tensor->lod()) {
    res.emplace_back(level);
  }
  return res;
}

void Tensor::SetName(const std::string &name) { name_ = name; }

const std::string &Tensor::name() const { return name_; }

void Tensor::SetPlace(PlaceType place,
                      int device,
                      const std::string device_type) {
  place_ = place;
  device_ = device;
  device_type_ = device_type;
}

#ifdef PADDLE_WITH_ONNXRUNTIME
void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; }

void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
  binding_ = binding;
}

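// The ONNXRuntime helpers below read output buffers directly from the
// Ort::IoBinding attached via SetOrtBinding().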
template <typename T>
T *Tensor::ORTGetMutableData() {
  auto binding = binding_.lock();
  PADDLE_ENFORCE_NOT_NULL(binding,
                          paddle::platform::errors::PreconditionNotMet(
                              "output tensor [%s] no binding ptr", name_));
  std::vector<Ort::Value> outputs = binding->GetOutputValues();
  Ort::Value &value = outputs[idx_];
  return value.GetTensorMutableData<T>();
}

template <typename T>
void Tensor::ORTCopyToCpu(T *data) const {
  auto binding = binding_.lock();
  PADDLE_ENFORCE_NOT_NULL(binding,
                          paddle::platform::errors::PreconditionNotMet(
                              "output tensor [%s] no binding ptr", name_));
  std::vector<Ort::Value> outputs = binding->GetOutputValues();
  Ort::Value &value = outputs[idx_];
  auto info = value.GetTensorTypeAndShapeInfo();
  size_t size = info.GetElementCount() * sizeof(T);

  if (place_ == PlaceType::kCPU) {
    std::memcpy(static_cast<void *>(data), value.GetTensorData<void *>(), size);
  } else {
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "CopyToCpu error.The current ONNXRuntime backend doesn't support "
        "GPU."));
  }
}

template void Tensor::ORTCopyToCpu<float>(float *data) const;
template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
#endif

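// The experimental InternalUtils copy helpers below take an explicit
// cudaStream_t supplied by the caller instead of the predictor's device
// context, and currently support only CPU and GPU places.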
namespace experimental {
template <typename T>
void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t,
                                            const T *data,
                                            cudaStream_t stream) {
  if (t->tensor_ == nullptr) {
    PADDLE_ENFORCE_EQ(
        t->name_.empty(),
        false,
        paddle::platform::errors::PreconditionNotMet(
            "Need to SetName first, so that the corresponding tensor can "
            "be retrieved."));
    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
    auto *var = scope->FindVar(t->name_);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        paddle::platform::errors::PreconditionNotMet(
            "No tensor called [%s] in the runtime scope", t->name_));
    auto *tensor = var->GetMutable<phi::DenseTensor>();
    t->tensor_ = tensor;
  }

  auto *tensor = static_cast<phi::DenseTensor *>(t->tensor_);
  PADDLE_ENFORCE_GE(tensor->numel(),
                    0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::Reshape(const "
                        "std::vector<int> &shape)"
                        "function before copying data from cpu."));
  size_t ele_size = tensor->numel() * sizeof(T);
  if (t->place_ == PlaceType::kCPU) {
    auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else if (t->place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::platform::CUDAPlace gpu_place(t->device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    paddle::memory::Copy(gpu_place,
                         static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(),
                         data,
                         ele_size,
                         stream);
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "CopyFromCpuWithIoStream only supports CPU and GPU now."));
  }
}

template <typename T>
void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t,
                                          T *data,
                                          cudaStream_t stream) {
  if (t->tensor_ == nullptr) {
    PADDLE_ENFORCE_EQ(
        t->name_.empty(),
        false,
        paddle::platform::errors::PreconditionNotMet(
            "Need to SetName first, so that the corresponding tensor can "
            "be retrieved."));
    auto *scope = static_cast<paddle::framework::Scope *>(t->scope_);
    auto *var = scope->FindVar(t->name_);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        paddle::platform::errors::PreconditionNotMet(
            "No tensor called [%s] in the runtime scope", t->name_));
    auto *tensor = var->GetMutable<phi::DenseTensor>();
    t->tensor_ = tensor;
  }

  auto *tensor = static_cast<phi::DenseTensor *>(t->tensor_);
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  if (paddle::platform::is_cpu_place(t_place)) {
#ifdef PADDLE_WITH_MKLDNN
    if (tensor->layout() == phi::DataLayout::ONEDNN) {
      phi::DenseTensor out;
      auto mem_allocation =
          std::make_shared<paddle::memory::allocation::Allocation>(
              static_cast<void *>(data),
              ele_num * sizeof(T),
              paddle::platform::CPUPlace());
      out.ResetHolder(mem_allocation);
      phi::funcs::TransDataLayoutFromOneDNN(
          tensor->layout(),
          phi::OneDNNContext::tls().get_cur_paddle_data_layout(),
          *tensor,
          &out,
          paddle::platform::CPUPlace(),
          true);
    } else {
      std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
    }
#else
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#endif
  } else if (t->place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data),
                         t_place,
                         t_data,
                         ele_num * sizeof(T),
                         stream);
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "CopyToCpuWithIoStream only supports CPU and GPU now."));
  }
}

template void InternalUtils::CopyFromCpuWithIoStream<double>(
    paddle_infer::Tensor *t, const double *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float>(
    paddle_infer::Tensor *t, const float *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<int64_t>(
    paddle_infer::Tensor *t, const int64_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<int32_t>(
    paddle_infer::Tensor *t, const int32_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<uint8_t>(
    paddle_infer::Tensor *t, const uint8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<int8_t>(
    paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<float16>(
    paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream);
template void InternalUtils::CopyFromCpuWithIoStream<bool>(
    paddle_infer::Tensor *t, const bool *data, cudaStream_t stream);

template void InternalUtils::CopyToCpuWithIoStream<double>(
    paddle_infer::Tensor *t, double *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float>(
    paddle_infer::Tensor *t, float *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<int64_t>(
    paddle_infer::Tensor *t, int64_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<int32_t>(
    paddle_infer::Tensor *t, int32_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<uint8_t>(
    paddle_infer::Tensor *t, uint8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<int8_t>(
    paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<float16>(
    paddle_infer::Tensor *t, float16 *data, cudaStream_t stream);
template void InternalUtils::CopyToCpuWithIoStream<bool>(
    paddle_infer::Tensor *t, bool *data, cudaStream_t stream);

}  // namespace experimental

}  // namespace paddle_infer