/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"

namespace py = pybind11;

namespace paddle {

namespace pybind {

namespace details {

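// Compile-time dispatch over a list of candidate element types. The `less`
// flag records whether index I still points inside the type list.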
template <bool less, size_t I, typename... ARGS>
struct CastToPyBufferImpl;

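// Base case: every candidate type has been tried and none matched the
// tensor's element type.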
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> {
  py::buffer_info operator()(framework::Tensor &tensor) {
    PADDLE_THROW("This type of tensor cannot be exposed to Python");
    return py::buffer_info();
  }
};

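// Recursive case: if the tensor's element type matches the I-th candidate,
// build a py::buffer_info for it (staging GPU data through a host copy);
// otherwise try the next candidate type.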
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> {
  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
  py::buffer_info operator()(framework::Tensor &tensor) {
    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
      auto dim_vec = framework::vectorize(tensor.dims());
      std::vector<size_t> dims_outside;
      std::vector<size_t> strides;
      dims_outside.resize(dim_vec.size());
      strides.resize(dim_vec.size());

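      // Compute C-contiguous (row-major) strides in bytes.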
      size_t prod = 1;
      for (size_t i = dim_vec.size(); i != 0; --i) {
        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
        strides[i - 1] = sizeof(CUR_TYPE) * prod;
        prod *= dims_outside[i - 1];
      }
      framework::Tensor dst_tensor;
      if (paddle::platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));

        platform::DeviceContextPool &pool =
            platform::DeviceContextPool::Instance();
        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
            pool.Get(tensor.place()));

        paddle::platform::GpuMemcpyAsync(
            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
            cudaMemcpyDeviceToHost, dev_ctx->stream());
        dev_ctx->Wait();
#else
        PADDLE_THROW("'CUDAPlace' is not supported on a CPU-only device.");
#endif
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
      }

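      // pybind11 has no format_descriptor for float16, so pass NumPy's 'e'
      // type string explicitly.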
      if (std::type_index(typeid(CUR_TYPE)) ==
          std::type_index(typeid(platform::float16))) {
        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
                               "e", /* np.dtype('e') == np.float16 */
                               (size_t)framework::arity(dst_tensor.dims()),
                               dims_outside, strides);
      } else {
        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
                               py::format_descriptor<CUR_TYPE>::format(),
                               (size_t)framework::arity(dst_tensor.dims()),
                               dims_outside, strides);
      }
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
    }
  }
};

}  // namespace details

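// Entry point: convert a Tensor into a py::buffer_info, trying each supported
// element type in turn.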
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
  auto buffer_info =
      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
                                  platform::float16>()(tensor);
  return buffer_info;
}
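
// A minimal usage sketch (illustrative only, not part of this header): the
// binding code is assumed to expose framework::Tensor through pybind11's
// buffer protocol, e.g.
//
//   py::class_<framework::Tensor>(m, "Tensor", py::buffer_protocol())
//       .def_buffer(
//           [](framework::Tensor &self) { return CastToPyBuffer(self); });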

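// Read a single element at `offset`; non-CPU tensors are first copied to host
// memory.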
template <typename T>
T TensorGetElement(framework::Tensor &self, size_t offset) {
  if (platform::is_cpu_place(self.place())) {
    return self.data<T>()[offset];
  } else {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
    return dst->data<T>()[offset];
  }
}

// TODO(dzhwinter): fix the redundant Tensor allocation and free.
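// Write a single element at `offset`; GPU tensors are round-tripped through a
// host copy.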
template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
  if (platform::is_gpu_place(self.place())) {
    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
    dst->data<T>()[offset] = elem;
    framework::TensorCopy(*dst.get(), self.place(), &self);

  } else if (platform::is_cpu_place(self.place())) {
    self.data<T>()[offset] = elem;
  }
}

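// Copy a NumPy array into a CPU tensor, resizing the tensor to the array's
// shape.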
template <typename T>
void PyCPUTensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back((int)array.shape()[i]);
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
}

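// Specialization for float16: NumPy hands the data over as uint16_t, and the
// raw bits are copied into a float16 tensor.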
template <>
void PyCPUTensorSetFromArray(
    framework::Tensor &self,
    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CPUPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back((int)array.shape()[i]);
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<platform::float16>(place);
  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}

#ifdef PADDLE_WITH_CUDA
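// Copy a NumPy array into a GPU tensor with an asynchronous host-to-device
// memcpy on the device context's stream.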
template <typename T>
void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<T, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CUDAPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back((int)array.shape()[i]);
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
}

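// float16 counterpart of the GPU path: the uint16_t bits from NumPy are
// copied directly into a float16 tensor on the device.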
template <>
void PyCUDATensorSetFromArray(
    framework::Tensor &self,
    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
    paddle::platform::CUDAPlace &place) {
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (size_t i = 0; i < array.ndim(); ++i) {
    dims.push_back((int)array.shape()[i]);
  }

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<platform::float16>(place);

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto dev_ctx =
      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
  paddle::platform::GpuMemcpyAsync(dst, array.data(),
                                   sizeof(uint16_t) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
}
#endif

}  // namespace pybind
}  // namespace paddle