/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <Python.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"

namespace paddle {
namespace pybind {
namespace details {
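// CastToPyBufferImpl walks the compile-time type list ARGS... one candidate at
// a time: each instantiation compares the I-th type against the tensor's
// runtime element type and either builds the pybind11::buffer_info or recurses
// to index I + 1. The <false, ...> specialization below reports an unsupported
// type once every candidate has been tried.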

template <bool less, size_t I, typename... ARGS>
struct CastToPyBufferImpl;

template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> {
  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
    PADDLE_THROW("This type of tensor cannot be exposed to Python");
    return pybind11::buffer_info();
  }
};

template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
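      // Mirror the tensor shape and compute row-major (C-order) strides in
      // bytes, as required by the Python buffer protocol.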
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
      std::vector<size_t> strides;
      dims_outside.resize(dim_vec.size());
      strides.resize(dim_vec.size());

      size_t prod = 1;
      for (size_t i = dim_vec.size(); i != 0; --i) {
        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
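      // GPU tensors are staged through a temporary CPU tensor so that the
      // resulting buffer points at host memory.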
      framework::Tensor dst_tensor;
      if (paddle::platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));

        platform::DeviceContextPool &pool =
            platform::DeviceContextPool::Instance();
        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
            pool.Get(tensor.place()));

        paddle::platform::GpuMemcpyAsync(
            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
            cudaMemcpyDeviceToHost, dev_ctx->stream());
        dev_ctx->Wait();
#else
        PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
      }

      if (std::type_index(typeid(CUR_TYPE)) ==
          std::type_index(typeid(platform::float16))) {
        return pybind11::buffer_info(
            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
            "e", /* np.dtype('e') == np.float16 */
            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
      } else {
        return pybind11::buffer_info(
            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
            pybind11::format_descriptor<CUR_TYPE>::format(),
            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
      }
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
    }
  }
};

}  // namespace details

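// Converts a framework::Tensor into a pybind11 buffer_info, trying each of the
// supported element types in turn. GPU tensors are copied to host memory first.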
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
  auto buffer_info =
      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
                                  platform::float16>()(tensor);
  return buffer_info;
}

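// Reads a single element from a tensor. For non-CPU tensors the whole tensor
// is first copied to host memory.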
template <typename T>
T TensorGetElement(const framework::Tensor &self, size_t offset) {
  if (platform::is_cpu_place(self.place())) {
    return self.data<T>()[offset];
  } else {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
    return dst->data<T>()[offset];
  }
}

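// Writes a single element into a tensor. For GPU tensors the data is copied to
// host memory, modified there, and copied back to the original place.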
// TODO(dzhwinter) : fix the redundant Tensor allocate and free
template <typename T>
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
  if (platform::is_gpu_place(self->place())) {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
    framework::TensorCopy(*self, platform::CPUPlace(), dst.get());
    dst->data<T>()[offset] = elem;
    framework::TensorCopy(*dst.get(), self->place(), self);
  } else if (platform::is_cpu_place(self->place())) {
    self->data<T>()[offset] = elem;
  }
}

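// Copies a NumPy array (C-contiguous, with a forced cast to T) into a tensor
// allocated on the CPU: the tensor is resized to the array's shape and the
// data is copied with memcpy.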
template <typename T>
void PyCPUTensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CPUPlace place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

template <>
// The following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCPUTensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CPUPlace place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}

#ifdef PADDLE_WITH_CUDA
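// Copies a NumPy array into a tensor allocated on a CUDA device. The
// host-to-device transfer is issued asynchronously on the device's stream.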
template <typename T>
void PyCUDATensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CUDAPlace place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
}

template <>
// The following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDATensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
    paddle::platform::CUDAPlace place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(),
                                   sizeof(uint16_t) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
}
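
// Copies a NumPy array into a tensor allocated in CUDA pinned (page-locked)
// host memory; the copy itself is a plain memcpy on the host.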
template <typename T>
void PyCUDAPinnedTensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
        array,
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

template <>
// The following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDAPinnedTensorSetFromArray(
    framework::Tensor *self,
    pybind11::array_t<uint16_t,
                      pybind11::array::c_style | pybind11::array::forcecast>
        array,
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self->Resize(framework::make_ddim(dims));
  auto *dst = self->mutable_data<platform::float16>(place);
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
#endif

}  // namespace pybind
}  // namespace paddle