tensor_py.h 9.9 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2

L
Luo Tao 已提交
3 4 5
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

L
Luo Tao 已提交
7
    http://www.apache.org/licenses/LICENSE-2.0
8

L
Luo Tao 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
14 15

#pragma once

#include <Python.h>

#include <string>
#include <tuple>
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"

#include "pybind11/common.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
27 28

namespace paddle {
29
namespace pybind {
30 31 32 33 34 35 36
namespace details {

template <bool less, size_t I, typename... ARGS>
struct CastToPyBufferImpl;

template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> {
37
  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
38
    PADDLE_THROW("This type of tensor cannot be expose to Python");
39
    return pybind11::buffer_info();
40 41 42 43 44 45
  }
};

template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> {
  // The I-th candidate element type from the ARGS... type list.
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  // Returns a pybind11 buffer view of `tensor` if its runtime element type
  // is CUR_TYPE; otherwise recurses to the next candidate type.
  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
      std::vector<size_t> strides;
      dims_outside.resize(dim_vec.size());
      strides.resize(dim_vec.size());

      // Build row-major (C-order) byte strides from the innermost dim out.
      size_t prod = 1;
      for (size_t i = dim_vec.size(); i != 0; --i) {
        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
      framework::Tensor dst_tensor;
      bool is_gpu = paddle::platform::is_gpu_place(tensor.place());
      if (is_gpu) {
#ifdef PADDLE_WITH_CUDA
        // GPU memory is not addressable from Python: stage a host copy first.
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace(),
            memory::Allocator::kCommunication));

        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
                                        sizeof(CUR_TYPE) * tensor.numel(),
                                        cudaMemcpyDeviceToHost);
#else
        PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        // CPU tensors are exposed in place; no data copy here.
        dst_tensor = tensor;
      }

      // numpy's format character for float16 is 'e'; pybind11 has no
      // format_descriptor for platform::float16, hence the special case.
      std::string dtype = std::type_index(typeid(CUR_TYPE)) ==
                                  std::type_index(typeid(platform::float16))
                              ? std::string("e")  // np.dtype('e') == np.float16
                              : pybind11::format_descriptor<CUR_TYPE>::format();

      if (is_gpu) {
        // manually construct a py_buffer if is_gpu since gpu data is copied
        // into CPU.
        // TODO(yy): Is these following code memleak?
        // NOTE(review): the malloc'd format/strides/shape/buf below do not
        // appear to be freed by buffer_info(py_buffer, true) — this looks
        // like a leak; confirm against the pybind11 version in use.
        Py_buffer *py_buffer =
            reinterpret_cast<Py_buffer *>(malloc(sizeof(Py_buffer)));
        py_buffer->format = strdup(dtype.c_str());
        py_buffer->itemsize = sizeof(CUR_TYPE);
        py_buffer->ndim = framework::arity(dst_tensor.dims());
        py_buffer->len = tensor.numel();
        py_buffer->strides = reinterpret_cast<Py_ssize_t *>(
            malloc(sizeof(Py_ssize_t) * strides.size()));
        for (size_t i = 0; i < strides.size(); ++i) {
          py_buffer->strides[i] = strides[i];
        }

        py_buffer->shape = reinterpret_cast<Py_ssize_t *>(
            malloc(sizeof(Py_ssize_t) * tensor.dims().size()));
        // NOTE(review): dims().size() looks signed; comparing against
        // `size_t i` may raise -Wsign-compare — verify.
        for (size_t i = 0; i < tensor.dims().size(); ++i) {
          py_buffer->shape[i] = tensor.dims()[i];
        }

        py_buffer->readonly = false;
        py_buffer->suboffsets = nullptr;
        py_buffer->obj = nullptr;
        // Second host copy, so the buffer outlives the local dst_tensor.
        py_buffer->buf =
            malloc(static_cast<size_t>(py_buffer->len * py_buffer->itemsize));
        memcpy(py_buffer->buf, dst_tensor.data<CUR_TYPE>(),
               static_cast<size_t>(py_buffer->len * py_buffer->itemsize));
        return pybind11::buffer_info(py_buffer, true);
      } else {
        // CPU path: expose dst_tensor's memory directly (zero copy).
        return pybind11::buffer_info(
            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), dtype,
            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
      }
    } else {
      // No match at index I: try the next candidate; `less` becomes false
      // when the type list is exhausted, selecting the throwing base case.
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
    }
  }
};
125

126
}  // namespace details
127

128
// Converts `tensor` to a pybind11 buffer by trying, in order, every element
// type Paddle exposes to Python; throws if none matches.
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
  return details::CastToPyBufferImpl<true, 0, float, int, double, int64_t,
                                     bool, uint8_t, int8_t,
                                     platform::float16>()(tensor);
}

135
template <typename T>
136
T TensorGetElement(const framework::Tensor &self, size_t offset) {
137 138 139 140
  if (platform::is_cpu_place(self.place())) {
    return self.data<T>()[offset];
  } else {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
F
fix  
fengjiayi 已提交
141
    framework::TensorCopySync(self, platform::CPUPlace(), dst.get());
142 143
    return dst->data<T>()[offset];
  }
144 145
}

Y
Yu Yang 已提交
146
// TODO(dzhwinter) : fix the redundant Tensor allocate and free
147
template <typename T>
148 149
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
  if (platform::is_gpu_place(self->place())) {
Y
Yu Yang 已提交
150 151 152 153
    framework::Tensor dst;
    framework::TensorCopySync(*self, platform::CPUPlace(), &dst);
    dst.mutable_data<T>(platform::CPUPlace())[offset] = elem;
    framework::TensorCopySync(dst, self->place(), self);
154
  } else if (platform::is_cpu_place(self->place())) {
Y
Yu Yang 已提交
155
    self->mutable_data<T>(self->place())[offset] = elem;
156
  }
157 158
}

159
template <typename T>
Q
qijun 已提交
160
void PyCPUTensorSetFromArray(
161 162 163 164
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CPUPlace place) {
Q
qijun 已提交
165
  std::vector<int64_t> dims;
166 167
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
168
    dims.push_back(static_cast<int>(array.shape()[i]));
169 170
  }

171 172
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);
173 174 175
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

176
template <>
C
chengduoZH 已提交
177 178
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
S
sneaxiy 已提交
179
inline void PyCPUTensorSetFromArray(
180 181 182 183 184
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CPUPlace place) {
185 186 187
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
188
    dims.push_back(static_cast<int>(array.shape()[i]));
189 190
  }

191 192
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);
193 194 195
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}

196
#ifdef PADDLE_WITH_CUDA
Q
qijun 已提交
197 198
template <typename T>
void PyCUDATensorSetFromArray(
199 200 201 202
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CUDAPlace place) {
Q
qijun 已提交
203
  std::vector<int64_t> dims;
Q
qijun 已提交
204 205
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
206
    dims.push_back(static_cast<int>(array.shape()[i]));
Q
qijun 已提交
207
  }
Q
qijun 已提交
208

209 210
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);
Y
Yu Yang 已提交
211 212
  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
                                  cudaMemcpyHostToDevice);
213
}
214 215

template <>
C
chengduoZH 已提交
216 217
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
S
sneaxiy 已提交
218
inline void PyCUDATensorSetFromArray(
219 220 221 222 223
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CUDAPlace place) {
224 225 226
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
227
    dims.push_back(static_cast<int>(array.shape()[i]));
228 229
  }

230 231
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);
Y
Yu Yang 已提交
232 233 234
  paddle::platform::GpuMemcpySync(dst, array.data(),
                                  sizeof(uint16_t) * array.size(),
                                  cudaMemcpyHostToDevice);
235
}
C
chengduoZH 已提交
236 237 238

template <typename T>
void PyCUDAPinnedTensorSetFromArray(
239 240 241
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
C
chengduoZH 已提交
242 243 244 245 246 247 248
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

249 250
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);
C
chengduoZH 已提交
251 252 253 254
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

template <>
C
chengduoZH 已提交
255 256
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
S
sneaxiy 已提交
257
inline void PyCUDAPinnedTensorSetFromArray(
258 259 260 261
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
C
chengduoZH 已提交
262 263 264 265 266 267 268
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

269 270
  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);
C
chengduoZH 已提交
271 272
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
Q
qijun 已提交
273
#endif
274 275 276

}  // namespace pybind
}  // namespace paddle