/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <Python.h>

#include <algorithm>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/common/pstring.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/unicode.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"

namespace py = pybind11;

namespace pybind11 {
namespace detail {

// Note: use the same enum number as numpy.float16.
// import numpy as np
// print(np.dtype(np.float16).num)  # 23
constexpr int NPY_FLOAT16_ = 23;
constexpr int NPY_UINT16_ = 4;
constexpr int NPY_COMPLEX64 = 14;
constexpr int NPY_COMPLEX128 = 15;

// Cast the numpy array dtype from S to T; this may allocate new memory.
template <class T, class S>
static py::array_t<T> CastNumpyType(py::array_t<S> array) {
  if (std::is_same<T, S>::value) {
    return array;
  }
  auto dim = array.ndim();
  std::vector<py::ssize_t> result_shape(dim);
  for (auto i = 0; i < dim; i++) {
    result_shape[i] = array.shape(i);
  }

  py::array_t<T> result(result_shape);

  return py::vectorize([](S s) { return static_cast<T>(s); })(array);
}

template <class T>
static py::array_t<T> CastNumpyArray(const py::object &array) {
  if (py::isinstance<py::array_t<float>>(array)) {
    return CastNumpyType<T>(array.cast<py::array_t<float>>());
  } else if (py::isinstance<py::array_t<double>>(array)) {
    return CastNumpyType<T>(array.cast<py::array_t<double>>());
  } else if (py::isinstance<py::array_t<int32_t>>(array)) {
    return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
  } else if (py::isinstance<py::array_t<int64_t>>(array)) {
    return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
  } else if (py::isinstance<py::array_t<bool>>(array)) {
    return CastNumpyType<T>(array.cast<py::array_t<bool>>());
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "Value type error. The assign numpy value allows integer, float, "
        "double and bool, "
        "but received %s.",
        Py_TYPE(array.ptr())->tp_name));
  }
  // can't reach here
  return py::array_t<T>();
}
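// Illustrative sketch (added comment, not part of the original header): how
// CastNumpyArray might be used from the pybind layer to normalize an incoming
// numpy object to float32 before copying it into a tensor. The variable `obj`
// is hypothetical and assumed to hold a numeric numpy array.
//
//   py::object obj = ...;  // e.g. a numpy array of int64
//   py::array_t<float> as_float = CastNumpyArray<float>(obj);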

// Note: Since float16 is not a builtin type in C++, we register
// paddle::platform::float16 as numpy.float16.
// Ref: https://github.com/pybind/pybind11/issues/1776
template <>
struct npy_format_descriptor<paddle::platform::float16> {
  static py::dtype dtype() {
    handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_);
    return reinterpret_borrow<py::dtype>(ptr);
  }
  static std::string format() {
    // Note: "e" represents float16.
    // Details at:
    // https://docs.python.org/3/library/struct.html#format-characters.
    return "e";
  }
  static constexpr auto name = _("float16");
};

// Note: Since bfloat16 is not a builtin type in C++ and in numpy,
// we register paddle::platform::bfloat16 as numpy.uint16.
template <>
struct npy_format_descriptor<paddle::platform::bfloat16> {
  static py::dtype dtype() {
    handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_UINT16_);
    return reinterpret_borrow<py::dtype>(ptr);
  }
  static std::string format() {
    // Note: "H" represents UINT16.
    // Details at:
    // https://docs.python.org/3/library/struct.html#format-characters.
    return "H";
  }
  static constexpr auto name = _("bfloat16");
};

// Note: we register paddle::platform::complex<float> as numpy.complex64 and
// paddle::platform::complex<double> as numpy.complex128.
template <>
struct npy_format_descriptor<paddle::platform::complex<float>> {
  static py::dtype dtype() {
    handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX64);
    return reinterpret_borrow<py::dtype>(ptr);
  }

  static std::string format() {
    // Note: "F" represents complex64.
    // Details at:
    // https://stackoverflow.com/questions/13997087/what-are-the-available-datatypes-for-dtype-with-numpys-loadtxt-an-genfromtx
    // for k, v in np.sctypeDict.iteritems():
    //     print '{0:14s} : {1:40s}'.format(str(k), v)
    return "F";
  }
  static constexpr auto name = _("complex64");
};

template <>
struct npy_format_descriptor<paddle::platform::complex<double>> {
  static py::dtype dtype() {
    handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX128);
    return reinterpret_borrow<py::dtype>(ptr);
  }

  static std::string format() {
    // Note: "D" represents complex128.
    // Details at:
    // https://stackoverflow.com/questions/13997087/what-are-the-available-datatypes-for-dtype-with-numpys-loadtxt-an-genfromtx
    // for k, v in np.sctypeDict.iteritems():
    //     print '{0:14s} : {1:40s}'.format(str(k), v)
    return "D";
  }
  static constexpr auto name = _("complex128");
};

}  // namespace detail
}  // namespace pybind11

namespace paddle {
namespace pybind {

namespace details {

template <typename T>
class PYBIND11_HIDDEN NumpyAllocation : public memory::Allocation {
 public:
  explicit NumpyAllocation(const py::array &arr)
      : Allocation(const_cast<void *>(arr.data()),
                   sizeof(T) * (arr.size()),
                   paddle::platform::CPUPlace()),
        arr_(arr.ptr()) {
    PADDLE_ENFORCE_NOT_NULL(
        arr_,
        platform::errors::InvalidArgument("The underlying PyObject pointer of "
                                          "numpy array cannot be nullptr"));
    PADDLE_ENFORCE_NE(
        arr_,
        Py_None,
        platform::errors::PreconditionNotMet(
            "The underlying PyObject pointer of numpy array cannot be None"));
    Py_INCREF(arr_);
  }
  ~NumpyAllocation() override {
    py::gil_scoped_acquire gil;
    Py_DECREF(arr_);
  }

 private:
  PyObject *arr_;
};

template <typename T>
struct ValidDTypeToPyArrayChecker {
  static constexpr bool kValue = false;
};

#define DECLARE_VALID_DTYPE_TO_PY_ARRAY(type) \
  template <>                                 \
  struct ValidDTypeToPyArrayChecker<type> {   \
    static constexpr bool kValue = true;      \
  }

DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex<float>);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex<double>);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(float);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(double);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(int8_t);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(int16_t);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(int);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(int64_t);
DECLARE_VALID_DTYPE_TO_PY_ARRAY(uint8_t);

inline std::string TensorDTypeToPyDTypeStr(
    framework::proto::VarType::Type type) {
#define TENSOR_DTYPE_TO_PY_DTYPE(T, proto_type)                             \
  if (type == proto_type) {                                                 \
    if (std::is_same<T, platform::float16>::value) {                        \
      return "e";                                                           \
    } else if (std::is_same<T, platform::bfloat16>::value) {                \
      /* NumPy character code of uint16 due to no support for bfloat16 */   \
      return "H";                                                           \
    } else if (std::is_same<T, platform::complex<float>>::value) {          \
      return "F";                                                           \
    } else if (std::is_same<T, platform::complex<double>>::value) {         \
      return "D";                                                           \
    } else {                                                                \
      constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker<T>::kValue; \
      PADDLE_ENFORCE_EQ(                                                    \
          kIsValidDType,                                                    \
          true,                                                             \
          platform::errors::Unimplemented(                                  \
              "This type [%s] of tensor cannot be exposed to Python",       \
              typeid(T).name()));                                           \
      return py::format_descriptor<T>::format();                            \
    }                                                                       \
  }

  _ForEachDataType_(TENSOR_DTYPE_TO_PY_DTYPE);
#undef TENSOR_DTYPE_TO_PY_DTYPE
  PADDLE_THROW(platform::errors::Unimplemented(
      "Unsupported tensor data type: %s", framework::DataTypeToString(type)));
}
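// For reference (added comment, not in the original source): the helper above
// returns the single-character numpy/struct format code for a proto VarType,
// e.g. FP32 -> "f", FP16 -> "e", BF16 -> "H" (uint16 stand-in), and
// COMPLEX64 -> "F"; unsupported types raise an Unimplemented error.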

}  // namespace details

template <typename T>
T TensorGetElement(const phi::DenseTensor &self, size_t offset) {
  PADDLE_ENFORCE_LT(offset,
                    self.numel(),
                    platform::errors::InvalidArgument(
                        "The offset exceeds the size of the tensor."));

  T b = static_cast<T>(0);
  if (platform::is_cpu_place(self.place())) {
    b = self.data<T>()[offset];
  } else if (platform::is_xpu_place(self.place())) {
#ifdef PADDLE_WITH_XPU
    const T *a = self.data<T>();
    auto p = self.place();
    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
  } else if (platform::is_gpu_place(self.place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    const T *a = self.data<T>();
    auto p = self.place();
    paddle::memory::Copy(
        platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
  } else if (platform::is_mlu_place(self.place())) {
#ifdef PADDLE_WITH_MLU
    const T *a = self.data<T>();
    auto p = self.place();
    paddle::memory::Copy(
        platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
  } else if (platform::is_npu_place(self.place())) {
#if defined(PADDLE_WITH_ASCEND_CL)
    const T *a = self.data<T>();
    auto p = self.place();
    paddle::memory::Copy(
        platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
  } else if (platform::is_custom_place(self.place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
    const T *a = self.data<T>();
    auto p = self.place();
    paddle::memory::Copy(
        platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
  }
  VLOG(10) << "TensorGetElement, place: " << self.place()
           << ", offset: " << offset << ", element: " << b;
  return b;
}

template <typename T>
void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) {
  PADDLE_ENFORCE_LT(offset,
                    self->numel(),
                    platform::errors::InvalidArgument(
                        "The offset exceeds the size of the tensor."));
  VLOG(10) << "TensorSetElement, place: " << self->place()
           << ", offset: " << offset << ", element: " << elem;
  if (platform::is_cpu_place(self->place())) {
    self->mutable_data<T>(self->place())[offset] = elem;
  } else if (platform::is_xpu_place(self->place())) {
#ifdef PADDLE_WITH_XPU
    auto p = self->place();
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
  } else if (platform::is_gpu_place(self->place())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    auto p = self->place();
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(
        p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
  } else if (platform::is_mlu_place(self->place())) {
#ifdef PADDLE_WITH_MLU
    auto p = self->place();
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(
        p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
  } else if (platform::is_npu_place(self->place())) {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto p = self->place();
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(
        p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
  } else if (platform::is_custom_place(self->place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
    auto p = self->place();
    T *a = self->mutable_data<T>(p);
    paddle::memory::Copy(
        p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
  }
}
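// Usage sketch (illustrative only, not part of the original header): reading
// and writing a single element of a CPU tensor through the helpers above.
// The tensor `t` and the values are hypothetical.
//
//   phi::DenseTensor t;
//   t.Resize(phi::make_ddim({2, 3}));
//   t.mutable_data<float>(platform::CPUPlace());
//   TensorSetElement<float>(&t, /*offset=*/4, 1.5f);
//   float v = TensorGetElement<float>(t, /*offset=*/4);  // v == 1.5f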

template <typename T, typename P>
void SetTensorFromPyArrayT(
    phi::DenseTensor *self,
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    const P &place,
    bool zero_copy) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int64_t>(array.shape()[i]));
  }
  self->Resize(phi::make_ddim(dims));

  if (paddle::platform::is_cpu_place(place)) {
    if (zero_copy) {
      auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
      auto type = framework::ToDataType(std::type_index(typeid(T)));
      self->ResetHolderWithType(holder, framework::TransToPhiDataType(type));
    } else {
      auto dst = self->mutable_data<T>(place);
      std::memcpy(dst, array.data(), array.nbytes());
    }
  } else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
    // NOTE(wangxi): When copying data to the accelerator card,
    // we need to call set_device(dev_id) first.
    platform::Place tmp_place = place;
    platform::XPUDeviceGuard guard(tmp_place.device);
    auto dst = self->mutable_data<T>(place);
    memory::Copy(tmp_place,
                 static_cast<void *>(dst),
                 platform::CPUPlace(),
                 static_cast<const void *>(array.data()),
                 array.nbytes());
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use XPUPlace in CPU/GPU version, "
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
    if (zero_copy) {
      auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
      auto type = framework::ToDataType(std::type_index(typeid(T)));
      self->ResetHolderWithType(holder, framework::TransToPhiDataType(type));
    } else {
      // IPU does not store Tensor data, Tensor will be created on CPU
      if (!self->initialized()) {
        auto dst = self->mutable_data<T>(place);
        std::memcpy(dst, array.data(), array.nbytes());
      } else {
        auto dst = self->mutable_data<T>(self->place());
        std::memcpy(dst, array.data(), array.nbytes());
      }
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
        "Please recompile or reinstall Paddle with IPU support."));
#endif
  } else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
    platform::Place tmp_place = place;
    platform::NPUDeviceGuard guard(tmp_place.device);
    auto dst = self->mutable_data<T>(place);
    platform::NPUMemcpySync(
        dst, array.data(), array.nbytes(), ACL_MEMCPY_HOST_TO_DEVICE);
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(place);
    ctx.Wait();
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use NPUPlace in CPU/GPU/XPU version. "
        "Please recompile or reinstall Paddle with NPU support."));
#endif
  } else if (paddle::platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
    platform::Place tmp_place = place;
    platform::MLUDeviceGuard guard(tmp_place.device);
    auto dst = self->mutable_data<T>(place);
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto dev_ctx = static_cast<platform::MLUDeviceContext *>(pool.Get(place));
    paddle::platform::MLUMemcpyH2DAsync(
        dst, array.data(), array.nbytes(), dev_ctx->stream());
    dev_ctx->Wait();
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use MLUPlace in CPU/GPU version, "
        "Please recompile or reinstall Paddle with MLU support."));
#endif
  } else if (paddle::platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    platform::Place tmp_place = place;
    phi::DeviceGuard guard(tmp_place);
    auto dst = self->mutable_data<T>(place);

    phi::DeviceManager::GetDeviceWithPlace(tmp_place)->MemoryCopyH2D(
        reinterpret_cast<void *>(dst),
        const_cast<void *>(reinterpret_cast<const void *>(array.data())),
        array.nbytes());
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(place);
    ctx.Wait();
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use CustomDevice in CPU/GPU/XPU version. "
        "Please recompile or reinstall Paddle with CustomDevice support."));
#endif
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (paddle::platform::is_gpu_place(place)) {
      // NOTE(wangxi): When copying data to the accelerator card,
      // we need to call set_device(dev_id) first.
      platform::CUDADeviceGuard guard(place.device);
      auto dst = self->mutable_data<T>(place);
#ifdef PADDLE_WITH_HIP
      paddle::platform::GpuMemcpySync(
          dst, array.data(), array.nbytes(), hipMemcpyHostToDevice);
#else
      paddle::platform::GpuMemcpySync(
          dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice);
#endif

    } else if (paddle::platform::is_cuda_pinned_place(place)) {
      auto dst = self->mutable_data<T>(place);
      std::memcpy(dst, array.data(), array.nbytes());
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Incompatible place type: Tensor.set() supports "
          "CPUPlace, CUDAPlace "
          "and CUDAPinnedPlace, but got %s!",
          place));
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use CUDAPlace or CUDAPinnedPlace in CPU only version, "
        "Please recompile or reinstall Paddle with CUDA support."));
#endif
  }
}

template <typename P>
void SetTensorFromPyArray(phi::DenseTensor *self,
                          const py::object &obj,
                          const P &place,
                          bool zero_copy) {
  auto array = obj.cast<py::array>();
  if (py::isinstance<py::array_t<float>>(array)) {
    SetTensorFromPyArrayT<float, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<int>>(array)) {
    SetTensorFromPyArrayT<int, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<int64_t>>(array)) {
    SetTensorFromPyArrayT<int64_t, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<double>>(array)) {
    SetTensorFromPyArrayT<double, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<int8_t>>(array)) {
    SetTensorFromPyArrayT<int8_t, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<int16_t>>(array)) {
    SetTensorFromPyArrayT<int16_t, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<uint8_t>>(array)) {
    SetTensorFromPyArrayT<uint8_t, P>(self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<paddle::platform::float16>>(array)) {
    SetTensorFromPyArrayT<paddle::platform::float16, P>(
        self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<paddle::platform::complex<float>>>(
                 array)) {
    SetTensorFromPyArrayT<paddle::platform::complex<float>, P>(
        self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<paddle::platform::complex<double>>>(
                 array)) {
    SetTensorFromPyArrayT<paddle::platform::complex<double>, P>(
        self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<uint16_t>>(array)) {
    // since there is still no support for bfloat16 in NumPy,
    // uint16 is used for casting bfloat16
    SetTensorFromPyArrayT<paddle::platform::bfloat16, P>(
        self, array, place, zero_copy);
  } else if (py::isinstance<py::array_t<bool>>(array)) {
    SetTensorFromPyArrayT<bool, P>(self, array, place, zero_copy);
  } else {
    // obj may be any type; obj.cast<py::array>() may fail, and then
    // array.dtype will be a string of unknown meaning.
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Input object type error or incompatible array data type. "
        "tensor.set() supports array with bool, float16, float32, "
        "float64, int8, int16, int32, int64, uint8 or uint16, "
        "please check your input or input array data type."));
  }
}
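// Usage sketch (illustrative only): SetTensorFromPyArray dispatches on the
// numpy dtype of `obj` and copies (or, on CPU with zero_copy, shares) the
// data into the destination tensor. The names below are hypothetical.
//
//   phi::DenseTensor dst;
//   py::object obj = ...;  // any supported numpy array
//   SetTensorFromPyArray(&dst, obj, platform::CPUPlace(), /*zero_copy=*/false);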

template <typename P>
void SetStringTensorFromPyArray(phi::StringTensor *self,
                                const py::array &array,
                                const P &place) {
  bool is_string_pyarray =
      array.dtype().kind() == 'S' || array.dtype().kind() == 'U';
  PADDLE_ENFORCE_EQ(is_string_pyarray,
                    true,
                    platform::errors::InvalidArgument(
                        "Expect the dtype of numpy array is string or "
                        "unicode, but recevie dtype %s",
                        array.dtype()));
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int64_t>(array.shape()[i]));
  }
  self->Resize(phi::make_ddim(dims));
  auto itemsize = array.itemsize();
  if (paddle::platform::is_cpu_place(place)) {
    auto dst = self->mutable_data(place);
    if (array.dtype().kind() == 'S') {
      for (int i = 0; i < self->numel(); ++i) {
        dst[i] =
            pstring(reinterpret_cast<const char *>(array.data()) + itemsize * i,
                    itemsize);
      }
    } else {
      // array.dtype().kind() == 'U'
      VLOG(6) << "numpy array itemsize: " << itemsize;
      for (int i = 0; i < self->numel(); ++i) {
        // Note(zhoushunjie): The itemsize of a unicode numpy array is the
        // size of each unicode string. Every string is padded to the max
        // length of the strings in the array, so all strings have the same
        // size. Each unicode character occupies 4 bytes, so the size of a
        // unicode string is 4 times its length.
        auto unicode_len = itemsize / 4;
        auto utf8_len = phi::strings::GetUTF8StrLen(
            reinterpret_cast<const uint32_t *>(array.data()) + unicode_len * i,
            unicode_len);
        pstring pstr(utf8_len - 1, 0);
        phi::strings::GetUTF8Str(
            reinterpret_cast<const uint32_t *>(array.data()) + unicode_len * i,
            pstr.mdata(),
            unicode_len);
        dst[i] = pstr;
      }
    }
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "StringTensor only support CPUPlace now, but receive %s",
        place.DebugString()));
  }
}
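// Usage sketch (illustrative only): filling a phi::StringTensor from a numpy
// array of bytes ('S') or unicode ('U') strings; only CPUPlace is supported.
// The names below are hypothetical.
//
//   phi::StringTensor st;
//   py::array arr = ...;  // e.g. np.array(["hello", "world"])
//   SetStringTensorFromPyArray(&st, arr, platform::CPUPlace());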

template <typename T>
void SetUVATensorFromPyArrayImpl(
    phi::DenseTensor *self_tensor,
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    int device_id) {
#if defined(PADDLE_WITH_CUDA)
  VLOG(4) << "Running in SetUVATensorFromPyArrayImpl.";
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  int64_t numel = 1;
  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
    dims.emplace_back(static_cast<int64_t>(array.shape()[i]));
    numel *= static_cast<int64_t>(array.shape()[i]);
  }
  self_tensor->Resize(phi::make_ddim(dims));

  auto data_type = framework::ToDataType(std::type_index(typeid(T)));
  const auto &need_allocate_size = numel * framework::SizeOfType(data_type);
  T *data_ptr;
  cudaHostAlloc(reinterpret_cast<void **>(&data_ptr),
                need_allocate_size,
                cudaHostAllocWriteCombined | cudaHostAllocMapped);
  std::memcpy(data_ptr, array.data(), array.nbytes());

  void *cuda_device_pointer = nullptr;
  cudaHostGetDevicePointer(reinterpret_cast<void **>(&cuda_device_pointer),
                           reinterpret_cast<void *>(data_ptr),
                           0);
  std::shared_ptr<memory::allocation::Allocation> holder =
      std::make_shared<memory::allocation::Allocation>(
          cuda_device_pointer,
          need_allocate_size,
          platform::CUDAPlace(device_id));
  self_tensor->ResetHolderWithType(holder,
                                   framework::TransToPhiDataType(data_type));
#endif
}

template <typename T>
void SetUVATensorFromPyArray(
    const std::shared_ptr<paddle::imperative::VarBase> &self,
    const py::array_t<T, py::array::c_style | py::array::forcecast> &array,
    int device_id) {
#if defined(PADDLE_WITH_CUDA)
  VLOG(4) << "Running in SetUVATensorFromPyArray for VarBase.";
  auto *self_tensor = self->MutableVar()->GetMutable<phi::DenseTensor>();
  SetUVATensorFromPyArrayImpl<T>(self_tensor, array, device_id);
#endif
}

template <typename T>
void SetUVATensorFromPyArray(
    const std::shared_ptr<paddle::experimental::Tensor> &self,
    const py::array_t<T> &array,
    int device_id) {
#if defined(PADDLE_WITH_CUDA)
  VLOG(4) << "Running in SetUVATensorFromPyArray for Phi::Tensor.";
  phi::DenseTensorMeta meta =
      phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1}));
  std::shared_ptr<phi::DenseTensor> tmp_t = std::make_shared<phi::DenseTensor>(
      std::make_unique<paddle::experimental::DefaultAllocator>(
          paddle::platform::CPUPlace())
          .get(),
      meta);
  self.get()->set_impl(tmp_t);
  auto *self_tensor = static_cast<phi::DenseTensor *>(self.get()->impl().get());

  SetUVATensorFromPyArrayImpl<T>(self_tensor, array, device_id);
#endif
}
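// Usage sketch (illustrative only): SetUVATensorFromPyArray copies the numpy
// data into cudaHostAlloc'ed, mapped (unified virtual addressing) host memory
// and wraps the matching device pointer in the tensor's allocation on
// CUDAPlace(device_id), so no separate device copy is made. The names below
// are hypothetical.
//
//   std::shared_ptr<paddle::imperative::VarBase> var = ...;
//   py::array_t<float> arr = ...;
//   SetUVATensorFromPyArray<float>(var, arr, /*device_id=*/0);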

template <typename T, size_t D>
void _sliceCompute(const phi::DenseTensor *in,
                   phi::DenseTensor *out,
                   const phi::CPUContext &ctx,
                   const std::vector<int> &axes,
                   const std::vector<int> &starts) {
  auto &eigen_place = *ctx.eigen_device();
  auto out_dims = out->dims();
  auto in_dims = in->dims();

  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
  for (size_t i = 0; i < D; ++i) {
    offsets[i] = 0;
    extents[i] = out_dims[i];
  }
  int start;
  for (size_t i = 0; i < axes.size(); ++i) {
    start = starts[i];
    if (start < 0) {
      start = (start + in_dims[axes[i]]);
    }
    start = std::max(start, 0);
    offsets[axes[i]] = start;
  }
  auto in_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *in);
  auto out_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *out);
  operators::EigenSlice<std::decay_t<decltype(eigen_place)>, T, D>::Eval(
      eigen_place, out_t, in_t, offsets, extents);
}

template <typename T>
void _concatCompute(const std::vector<phi::DenseTensor> &ins,
                    phi::DenseTensor *out,
                    const phi::CPUContext &ctx,
                    int64_t axis) {
  if (axis == 0 && ins.size() < 10) {
    size_t output_offset = 0;
    for (auto &in : ins) {
      auto in_stride = phi::stride_numel(in.dims());
      auto out_stride = phi::stride_numel(out->dims());
      phi::funcs::StridedNumelCopyWithAxis<T>(ctx,
                                              axis,
                                              out->data<T>() + output_offset,
                                              out_stride,
                                              in.data<T>(),
                                              in_stride,
                                              in_stride[axis]);
      output_offset += in_stride[axis];
    }
  } else {
    paddle::operators::math::ConcatFunctor<phi::CPUContext, T> concat_functor;
    concat_functor(ctx, ins, static_cast<int>(axis), out);
  }
}

inline void _getSliceinfo(const phi::DenseTensor &self,
                          py::object obj,
                          const int64_t dim,
                          int64_t *pstart,
                          int64_t *pstop,
                          int64_t *pstep,
                          int64_t *pslicelength) {
  auto &start = *pstart;
  auto &stop = *pstop;
  auto &step = *pstep;
  auto &slicelength = *pslicelength;
  const framework::DDim &srcDDim = self.dims();
  PADDLE_ENFORCE(
      0 <= dim && dim < srcDDim.size(),
      platform::errors::OutOfRange("The dim %d of slice is out of bounds, it "
                                   "should be in the range of [0, %d).",
                                   dim,
                                   srcDDim.size()));

  if (py::isinstance<py::slice>(obj)) {
    size_t lstart, lstop, lstep, lslicelength;
    py::slice s = static_cast<py::slice>(obj);
    if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) {
      PADDLE_THROW(platform::errors::OutOfRange(
          "Slice on dim %d is invalid, please check the validity of the "
          "tensor dims or the slice item.",
          dim));
    }
    start = static_cast<int64_t>(lstart);
    stop = static_cast<int64_t>(lstop);
    step = static_cast<int64_t>(lstep);
    slicelength = static_cast<int64_t>(lslicelength);
  } else if (py::isinstance<py::int_>(obj)) {
    start = static_cast<int64_t>(static_cast<py::int_>(obj));
    PADDLE_ENFORCE(
        std::abs(start) < srcDDim[dim],
        platform::errors::OutOfRange("The start %d of slice is out of bounds, "
                                     "it should be in the range of (%d, %d).",
                                     start,
                                     -srcDDim[dim],
                                     srcDDim[dim]));
    // Normalize a negative start index into the valid range [0, srcDDim[dim]).
    start = (start >= 0) ? start : srcDDim[dim] + start;
    stop = start + 1;
    step = 1;
    slicelength = 1;
  } else {
    PADDLE_THROW(
        platform::errors::OutOfRange("Index object error, the index object for "
                                     "slice only supports slice(::) and int."));
  }
}

inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self,
                                    const framework::DDim &ddim) {
  phi::DenseTensor *output = new phi::DenseTensor();
  output->Resize(ddim);
  auto place = self.place();
  if (platform::is_cpu_place(place)) {
    output->mutable_data(place, self.dtype());
  } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
    output->mutable_data(place, self.dtype());
#endif
  } else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
    output->mutable_data(place, self.dtype());
#endif
  } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_cuda_pinned_place(place)) {
      output->mutable_data(place, self.dtype());
    } else if ((platform::is_gpu_place(place))) {
      output->mutable_data(place, self.dtype());
    }
#endif
  }
  return output;
}

template <typename T>
void _sliceDapper(const phi::DenseTensor *in,
                  phi::DenseTensor *out,
                  const phi::CPUContext &ctx,
                  const std::vector<int> &axes,
                  const std::vector<int> &starts,
                  int size) {
  switch (size) {
    case 1:
      _sliceCompute<T, 1>(in, out, ctx, axes, starts);
      break;
    case 2:
      _sliceCompute<T, 2>(in, out, ctx, axes, starts);
      break;
    case 3:
      _sliceCompute<T, 3>(in, out, ctx, axes, starts);
      break;
    case 4:
      _sliceCompute<T, 4>(in, out, ctx, axes, starts);
      break;
    case 5:
      _sliceCompute<T, 5>(in, out, ctx, axes, starts);
      break;
    case 6:
      _sliceCompute<T, 6>(in, out, ctx, axes, starts);
      break;
    case 7:
      _sliceCompute<T, 7>(in, out, ctx, axes, starts);
      break;
    case 8:
      _sliceCompute<T, 8>(in, out, ctx, axes, starts);
      break;
    case 9:
      _sliceCompute<T, 9>(in, out, ctx, axes, starts);
      break;
    default:
      PADDLE_THROW(platform::errors::InvalidArgument(
          "The dim size should be 1 to 9, current is %d", size));
      break;
  }
}

template <typename T>
inline phi::DenseTensor *_sliceWrapper(const phi::DenseTensor &self,
                                       const phi::CPUContext &ctx,
                                       py::object obj,
                                       int dim,
                                       int64_t start,
                                       int64_t slicelength) {
  framework::DDim dstDDim = self.dims();
  dstDDim[dim] = static_cast<int64_t>(slicelength);
  std::vector<int> axes({dim});
  std::vector<int> starts({static_cast<int>(start)});
  phi::DenseTensor *output = _getTensor(self, dstDDim);
  _sliceDapper<T>(&self, output, ctx, axes, starts, dstDDim.size());
  return output;
}

template <typename T>
inline phi::DenseTensor *_sliceAndConcat(const phi::DenseTensor &self,
                                         py::object obj,
                                         int dim) {
  phi::CPUContext ctx;
  int64_t start, stop, step, slicelength;
  _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength);
  if (step == 1 || slicelength == 1) {
    return _sliceWrapper<T>(self, ctx, obj, dim, start, slicelength);
  } else {
    std::vector<phi::DenseTensor> ins;
    for (auto i = 0; i < slicelength; ++i, start += step) {
      ins.emplace_back(*_sliceWrapper<T>(self, ctx, obj, dim, start, 1));
    }

    // do the concat operation
    framework::DDim dstDDim = self.dims();
    dstDDim[dim] = static_cast<int64_t>(slicelength);
    phi::DenseTensor *output1 = _getTensor(self, dstDDim);
    _concatCompute<T>(ins, output1, ctx, dim);
    return output1;
  }
}

inline phi::DenseTensor *_sliceTensor(const phi::DenseTensor &self,
                                      py::object obj,
                                      int dim) {
  auto src_type = framework::TransToProtoVarType(self.dtype());
  switch (src_type) {
    case framework::proto::VarType::FP16:
      return _sliceAndConcat<paddle::platform::float16>(self, obj, dim);
    case framework::proto::VarType::BF16:
      return _sliceAndConcat<paddle::platform::bfloat16>(self, obj, dim);
    case framework::proto::VarType::COMPLEX64:
      return _sliceAndConcat<paddle::platform::complex<float>>(self, obj, dim);
    case framework::proto::VarType::COMPLEX128:
      return _sliceAndConcat<paddle::platform::complex<double>>(self, obj, dim);
    case framework::proto::VarType::FP32:
      return _sliceAndConcat<float>(self, obj, dim);
    case framework::proto::VarType::FP64:
      return _sliceAndConcat<double>(self, obj, dim);
    case framework::proto::VarType::INT8:
      return _sliceAndConcat<int8_t>(self, obj, dim);
    case framework::proto::VarType::INT16:
      return _sliceAndConcat<int16_t>(self, obj, dim);
    case framework::proto::VarType::INT32:
      return _sliceAndConcat<int>(self, obj, dim);
    case framework::proto::VarType::INT64:
      return _sliceAndConcat<int64_t>(self, obj, dim);
    case framework::proto::VarType::BOOL:
      return _sliceAndConcat<bool>(self, obj, dim);
    case framework::proto::VarType::UINT8:
      return _sliceAndConcat<uint8_t>(self, obj, dim);
    default:
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Unsupported tensor type: %s",
          framework::DataTypeToString(src_type)));
  }
}

inline phi::DenseTensor *_pySliceTensor(const phi::DenseTensor &self,
                                        py::object obj) {
  if (py::isinstance<py::tuple>(obj)) {
    py::list l = static_cast<py::list>(obj);
    std::unique_ptr<phi::DenseTensor> target;
    phi::DenseTensor *src = const_cast<phi::DenseTensor *>(&self);
    for (auto i = 0; i < static_cast<int>(l.size()); ++i) {
      src = _sliceTensor(*src, l[i], i);
      if (i + 1 == static_cast<int>(l.size())) {
        return src;
      } else {
        target.reset(src);
      }
    }
    return nullptr;
  } else {
    return _sliceTensor(self, obj, 0);
  }
}

inline phi::DenseTensor *PySliceTensor(const phi::DenseTensor &self,
                                       py::object obj) {
  if (platform::is_gpu_place(self.place())) {
    std::unique_ptr<phi::DenseTensor> holder;
    phi::DenseTensor src;
    framework::TensorCopySync(self, platform::CPUPlace(), &src);
    phi::DenseTensor *output = _pySliceTensor(src, obj);
    holder.reset(output);
    phi::DenseTensor *dst = _getTensor(*output, output->dims());
    framework::TensorCopySync(*output, self.place(), dst);
    return dst;
  } else {
    return _pySliceTensor(self, obj);
  }
}
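// Usage sketch (illustrative only): PySliceTensor applies a Python slice (or
// a tuple of slices/ints) to a DenseTensor on the C++ side; GPU tensors are
// first copied to CPU, sliced, and copied back. The caller owns the returned
// tensor. The names below are hypothetical.
//
//   phi::DenseTensor t = ...;
//   py::object idx = ...;  // e.g. a py::slice or tuple built in Python
//   std::unique_ptr<phi::DenseTensor> out(PySliceTensor(t, idx));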

inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
                                 bool need_deep_copy = false) {
  if (!tensor.IsInitialized()) {
    return py::array();
  }
  bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
  bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
  bool is_npu_tensor = platform::is_npu_place(tensor.place());
  bool is_mlu_tensor = platform::is_mlu_place(tensor.place());
  bool is_custom_device_tensor = platform::is_custom_place(tensor.place());
  const auto &tensor_dims = tensor.dims();
  auto tensor_dtype = framework::TransToProtoVarType(tensor.dtype());
  size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);

  std::vector<size_t> py_dims(tensor_dims.size());
  std::vector<size_t> py_strides(tensor_dims.size());

  size_t numel = 1;
  for (int i = tensor_dims.size() - 1; i >= 0; --i) {
    py_dims[i] = static_cast<size_t>(tensor_dims[i]);
    py_strides[i] = sizeof_dtype * numel;
    numel *= py_dims[i];
  }

  const void *tensor_buf_ptr = tensor.data();

  std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(
      framework::TransToProtoVarType(tensor.dtype()));

  if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor &&
      !is_custom_device_tensor) {
    if (!need_deep_copy) {
      auto base = py::cast(std::move(tensor));
      return py::array(py::dtype(py_dtype_str.c_str()),
                       py_dims,
                       py_strides,
                       const_cast<void *>(tensor_buf_ptr),
                       base);
    } else {
      py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
      PADDLE_ENFORCE_EQ(
          py_arr.writeable(),
          true,
          platform::errors::InvalidArgument(
              "PyArray is not writable, in which case memory leak "
              "or double free would occur"));
      PADDLE_ENFORCE_EQ(
          py_arr.owndata(),
          true,
          platform::errors::InvalidArgument(
              "PyArray does not own data, in which case  memory leak "
              "or double free would occur"));
      platform::CPUPlace place;
      size_t copy_bytes = sizeof_dtype * numel;
      paddle::memory::Copy(
          place, py_arr.mutable_data(), place, tensor_buf_ptr, copy_bytes);
      return py_arr;
    }
  } else if (is_xpu_tensor) {
#ifdef PADDLE_WITH_XPU
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(),
                      true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(),
        true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case  memory leak "
            "or double free would occur"));

    size_t copy_bytes = sizeof_dtype * numel;
    auto p = tensor.place();
    paddle::memory::Copy(platform::CPUPlace(),
                         py_arr.mutable_data(),
                         p,
                         tensor_buf_ptr,
                         copy_bytes);
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use XPUPlace in CPU/GPU version, "
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else if (is_gpu_tensor) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(),
                      true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(),
        true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case  memory leak "
            "or double free would occur"));

    size_t copy_bytes = sizeof_dtype * numel;
    auto p = tensor.place();
    paddle::memory::Copy(platform::CPUPlace(),
                         py_arr.mutable_data(),
                         p,
                         tensor_buf_ptr,
                         copy_bytes,
                         nullptr);
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use CUDAPlace in CPU only version, "
        "Please recompile or reinstall Paddle with CUDA support."));
#endif
  } else if (is_npu_tensor) {
#ifdef PADDLE_WITH_ASCEND_CL
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(),
                      true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(),
        true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case  memory leak "
            "or double free would occur"));

    size_t copy_bytes = sizeof_dtype * numel;
    auto p = tensor.place();
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(tensor.place());
    paddle::memory::Copy(
        platform::CPUPlace(),
        py_arr.mutable_data(),
        p,
        tensor_buf_ptr,
        copy_bytes,
        reinterpret_cast<const platform::NPUDeviceContext &>(ctx).stream());
    ctx.Wait();
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use NPUPlace in CPU/GPU/XPU version, "
        "Please recompile or reinstall Paddle with NPU support."));
#endif
  } else if (is_mlu_tensor) {
#ifdef PADDLE_WITH_MLU
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(),
                      true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(),
        true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case  memory leak "
            "or double free would occur"));

    size_t copy_bytes = sizeof_dtype * numel;
    auto p = tensor.place();
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(tensor.place());
    paddle::memory::Copy(
        platform::CPUPlace(),
        py_arr.mutable_data(),
        p,
        tensor_buf_ptr,
        copy_bytes,
        reinterpret_cast<const platform::MLUDeviceContext &>(ctx).stream());
    ctx.Wait();
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, "
        "Please recompile or reinstall Paddle with MLU support."));
#endif
  } else if (is_custom_device_tensor) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
    PADDLE_ENFORCE_EQ(py_arr.writeable(),
                      true,
                      platform::errors::InvalidArgument(
                          "PyArray is not writable, in which case memory leak "
                          "or double free would occur"));
    PADDLE_ENFORCE_EQ(
        py_arr.owndata(),
        true,
        platform::errors::InvalidArgument(
            "PyArray does not own data, in which case  memory leak "
            "or double free would occur"));

    // TODO(qili93): temporary for ascend npu performance, to be removed along
    // with npu_identity op
    paddle::experimental::Tensor tensor_out(
        std::make_shared<phi::DenseTensor>());
    if (tensor.storage_properties_initialized()) {
      paddle::experimental::Tensor tensor_in(
          std::make_shared<phi::DenseTensor>(tensor));
      tensor_out = npu_identity_ad_func(tensor_in, -1);
      auto dense_tensor =
          std::dynamic_pointer_cast<phi::DenseTensor>(tensor_out.impl());
      tensor_buf_ptr = dense_tensor->data();
    }

    size_t copy_bytes = sizeof_dtype * numel;
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &ctx = *pool.Get(tensor.place());
    paddle::memory::Copy(
        platform::CPUPlace(),
        py_arr.mutable_data(),
        tensor.place(),
        tensor_buf_ptr,
        copy_bytes,
        reinterpret_cast<const platform::CustomDeviceContext &>(ctx).stream());
    ctx.Wait();
    return py_arr;
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, "
        "Please recompile or reinstall Paddle with CustomPlace "
        "support."));
#endif
  }
  PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
  return py::array();
1205 1206
}
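// Usage sketch (illustrative only): converting a tensor to a numpy array.
// For CPU tensors with need_deep_copy == false the returned array shares the
// tensor's buffer (the tensor is kept alive via the `base` object); device
// tensors are always copied back to host memory. The name `t` is hypothetical.
//
//   phi::DenseTensor t = ...;
//   py::array arr = TensorToPyArray(t);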

}  // namespace pybind
}  // namespace paddle