/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <tuple>
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"

#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
25 26 27 28 29

namespace py = pybind11;

namespace paddle {

30
namespace pybind {
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48

namespace details {

// Recursive type-dispatch helper: walks the ARGS type list (current index I)
// until a candidate type matches the tensor's runtime element type.  The
// `less` flag is true while I is still a valid index into the type list;
// the <false, ...> specialization terminates the recursion with an error.
template <bool less, size_t I, typename... ARGS>
struct CastToPyBufferImpl;

// Terminal case of the type-list walk: every candidate type in ARGS has been
// tried and none matched the tensor's runtime element type, so the tensor
// cannot be converted to a Python buffer.
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> {
  py::buffer_info operator()(framework::Tensor &tensor) {
    PADDLE_THROW("This type of tensor cannot be exposed to Python");
    return py::buffer_info();  // unreachable; keeps the signature well-formed
  }
};

// Recursive case: if the tensor's runtime element type equals CUR_TYPE (the
// I-th type in ARGS), build a py::buffer_info describing the tensor's data;
// otherwise recurse to the next candidate type.
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
      std::vector<size_t> strides;
      dims_outside.resize(dim_vec.size());
      strides.resize(dim_vec.size());

      // Compute row-major byte strides: walk the dims from innermost
      // (last) to outermost, accumulating the element count in `prod`.
      size_t prod = 1;
      for (size_t i = dim_vec.size(); i != 0; --i) {
        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
      // NOTE(review): dst_tensor is function-local.  On the GPU path the
      // returned buffer_info points at dst_tensor's freshly allocated CPU
      // storage -- confirm that allocation outlives the returned buffer.
      // Also, for a place that is neither GPU nor CPU (e.g. CUDA-pinned),
      // dst_tensor is left empty and data<CUR_TYPE>() is called on it
      // anyway -- verify callers never pass such a tensor here.
      framework::Tensor dst_tensor;
      if (paddle::platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
        // Device tensor: stage a copy into host (CPU) memory first.
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));

        platform::DeviceContextPool &pool =
            platform::DeviceContextPool::Instance();
        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
            pool.Get(tensor.place()));

        paddle::platform::GpuMemcpyAsync(
            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
            cudaMemcpyDeviceToHost, dev_ctx->stream());
        // Block until the async device-to-host copy has completed.
        dev_ctx->Wait();
#else
        PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        // CPU tensor: share the underlying storage, no copy.
        dst_tensor = tensor;
      }

      // numpy has no stock format descriptor for float16; its dtype
      // character 'e' must be passed explicitly.
      if (std::type_index(typeid(CUR_TYPE)) ==
          std::type_index(typeid(platform::float16))) {
        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
                               "e", /* np.dtype('e') == np.float16 */
                               (size_t)framework::arity(dst_tensor.dims()),
                               dims_outside, strides);
      } else {
        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
                               py::format_descriptor<CUR_TYPE>::format(),
                               (size_t)framework::arity(dst_tensor.dims()),
                               dims_outside, strides);
      }
    } else {
      // Not this type: try the next candidate in ARGS (if any).
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
    }
  }
};
103

104
}  // namespace details
105

106
// Produce a py::buffer_info view of `tensor`, dispatching over the set of
// element types supported on the Python side (GPU tensors are staged through
// host memory by the dispatcher).
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
  using Caster = details::CastToPyBufferImpl<true, 0, float, int, double,
                                             int64_t, bool, platform::float16>;
  return Caster()(tensor);
}

113 114
// Read the element of `self` at flat index `offset`.
//
// CPU tensors are read in place; any other placement is first copied into a
// host-side staging tensor.  The staging tensor is a stack-local object --
// RAII frees its storage on return, so the previous `new` + shared_ptr pair
// was an unnecessary heap allocation.
template <typename T>
T TensorGetElement(framework::Tensor &self, size_t offset) {
  if (platform::is_cpu_place(self.place())) {
    return self.data<T>()[offset];
  } else {
    framework::Tensor dst;
    framework::TensorCopy(self, platform::CPUPlace(), &dst);
    return dst.data<T>()[offset];
  }
}

124
// TODO(dzhwinter) : fix the redundent Tensor allocate and free
125 126
template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
127 128
  if (platform::is_gpu_place(self.place())) {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
Y
Yi Wang 已提交
129
    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
130
    dst->data<T>()[offset] = elem;
Y
Yi Wang 已提交
131
    framework::TensorCopy(*dst.get(), self.place(), &self);
132 133 134 135

  } else if (platform::is_cpu_place(self.place())) {
    self.data<T>()[offset] = elem;
  }
136 137
}

138
template <typename T>
Q
qijun 已提交
139
void PyCPUTensorSetFromArray(
140
    framework::Tensor &self,
Q
qijun 已提交
141 142
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
Q
qijun 已提交
143
  std::vector<int64_t> dims;
144 145
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
146
    dims.push_back(static_cast<int>(array.shape()[i]));
147 148
  }

F
fengjiayi 已提交
149
  self.Resize(framework::make_ddim(dims));
Q
qijun 已提交
150
  auto *dst = self.mutable_data<T>(place);
151 152 153
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

154
template <>
C
chengduoZH 已提交
155 156
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
157 158 159 160 161 162 163
void PyCPUTensorSetFromArray(
    framework::Tensor &self,
    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
164
    dims.push_back(static_cast<int>(array.shape()[i]));
165 166 167 168 169 170 171
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<platform::float16>(place);
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}

172
#ifdef PADDLE_WITH_CUDA
Q
qijun 已提交
173 174 175 176
template <typename T>
void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
D
dzhwinter 已提交
177
    paddle::platform::CUDAPlace &place) {
Q
qijun 已提交
178
  std::vector<int64_t> dims;
Q
qijun 已提交
179 180
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
181
    dims.push_back(static_cast<int>(array.shape()[i]));
Q
qijun 已提交
182
  }
Q
qijun 已提交
183 184 185

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
D
dzhwinter 已提交
186

Y
Yang Yu 已提交
187
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
D
dzhwinter 已提交
188
  auto dev_ctx =
Y
Yang Yu 已提交
189
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
D
dzhwinter 已提交
190 191
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
192
}
193 194

template <>
C
chengduoZH 已提交
195 196
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
197 198 199 200 201 202 203
void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CUDAPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
C
chengduoZH 已提交
204
    dims.push_back(static_cast<int>(array.shape()[i]));
205 206 207 208 209 210 211 212 213 214 215 216
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<platform::float16>(place);

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(),
                                   sizeof(uint16_t) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
}
C
chengduoZH 已提交
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234

// Copy a row-major numpy array into `self` backed by CUDA-pinned (page-locked
// host) memory.  Pinned memory is CPU-addressable, so a plain memcpy works.
template <typename T>
void PyCUDAPinnedTensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  // Cast signed ndim() once; keep full 64-bit extents (no int truncation).
  for (size_t i = 0; i < static_cast<size_t>(array.ndim()); ++i) {
    dims.push_back(static_cast<int64_t>(array.shape()[i]));
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

template <>
C
chengduoZH 已提交
235 236
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
C
chengduoZH 已提交
237 238 239 240 241 242 243 244 245 246 247 248 249 250
void PyCUDAPinnedTensorSetFromArray(
    framework::Tensor &self,
    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
    const paddle::platform::CUDAPinnedPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back(static_cast<int>(array.shape()[i]));
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<platform::float16>(place);
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
Q
qijun 已提交
251
#endif
252 253 254

}  // namespace pybind
}  // namespace paddle