Unverified commit cfffb1a3, authored by Yi Wang and committed by GitHub

Update tensor_util.h (#8422)

* Update tensor_util.h

* Update with moved TensorDesc

* Fix tensor_util.cu

* Update

* Update

* Update

* Update

* Make tensor_util.cu a symbolic link
Parent cbc72a70
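This change renames the free tensor helpers in paddle::framework: Copy becomes TensorCopy, HasNAN/HasInf become TensorContainsNAN/TensorContainsInf, CopyFromVector/CopyToVector become TensorFromVector/TensorToVector, and the Tensor overloads of SerializeToStream/DeserializeFromStream become TensorToStream/TensorFromStream. For orientation, a minimal CPU-only sketch of the renamed calls (illustrative only, not part of the commit; the example function and values are made up):

#include <vector>
#include "paddle/fluid/framework/tensor_util.h"

void RenamedHelpersSketch() {  // hypothetical example function
  namespace fw = paddle::framework;
  fw::Tensor src, dst;
  std::vector<float> values = {0.0f, 1.0f, 2.0f};

  fw::TensorFromVector<float>(values, &src);                  // was CopyFromVector
  fw::TensorCopy(src, paddle::platform::CPUPlace(), &dst);    // was Copy

  bool bad = fw::TensorContainsNAN(dst) ||                    // was HasNAN
             fw::TensorContainsInf(dst);                      // was HasInf

  std::vector<float> out;
  fw::TensorToVector<float>(dst, &out);                       // was CopyToVector
  (void)bad;
}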
......@@ -37,7 +37,7 @@ void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
<< " dst_place: " << dst_place;
auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
dev_ctx->Wait();
Copy(in, dst_place, *dev_ctx, out);
TensorCopy(in, dst_place, *dev_ctx, out);
dev_ctx->Wait();
}
......
......@@ -157,8 +157,8 @@ TEST(Operator, CPUtoGPU) {
auto dev_ctx = pool.Get(cuda_place);
paddle::framework::Tensor output_tensor;
Copy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
dev_ctx->Wait();
float* output2_ptr = output_tensor.data<float>();
......
......@@ -73,8 +73,10 @@ static void CheckTensorNANOrInf(const std::string& name,
tensor.type().hash_code() != typeid(double).hash_code()) {
return;
}
PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
"Tensor %s contains Inf", name);
PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
"Tensor %s contains NAN", name);
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
......
......@@ -46,7 +46,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
if (!platform::is_cpu_place(t.place())) {
LoDTensor tt;
framework::Copy(t, platform::CPUPlace(), &tt);
framework::TensorCopy(t, platform::CPUPlace(), &tt);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(t.place());
dev_ctx.Wait();
......@@ -255,7 +255,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
}
}
// the 3rd field, Tensor
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
TensorToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
......@@ -282,7 +282,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
}
}
// the 3rd field, Tensor
DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
......@@ -308,14 +308,14 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
if (lod().empty()) {
auto src = Slice(begin, end);
auto &dst_place = places[i];
framework::Copy(src, dst_place, &dst);
framework::TensorCopy(src, dst_place, &dst);
} else {
auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
auto &offset = lod_and_offset.second;
auto src = Slice(offset.first, offset.second);
auto &dst_place = places[i];
framework::Copy(src, dst_place, &dst);
framework::TensorCopy(src, dst_place, &dst);
LoD my_lod;
for (auto &l : lod_and_offset.first) {
......@@ -369,7 +369,7 @@ void LoDTensor::MergeLoDTensor(
for (auto *src : lod_tensors) {
int end = begin + src->dims()[0];
auto dst = Slice(begin, end);
framework::Copy(*src, dst_place, &dst);
framework::TensorCopy(*src, dst_place, &dst);
begin = end;
}
}
......
......@@ -175,8 +175,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
for (size_t ins = 0; ins < num_instances; ins++) {
for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
auto slice = tensor.Slice(elem, elem + 1);
Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
platform::CPUDeviceContext(), &slice);
TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(),
platform::CPUDeviceContext(), &slice);
}
}
return tensor;
......
......@@ -291,7 +291,7 @@ class Vector {
void CopyToCPU() const {
// COPY GPU Data To CPU
Copy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
......@@ -305,13 +305,14 @@ class Vector {
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
......@@ -322,13 +323,14 @@ class Vector {
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
Copy(cpu_vec_, boost::get<platform::CUDAPlace>(place), &cuda_vec_);
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
Copy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
......
......@@ -105,7 +105,7 @@ void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
}
}
Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
Copy(buffer_[i][j], platform::CPUPlace(), &dst);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
......
......@@ -34,7 +34,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
os.write(reinterpret_cast<const char*>(&height), sizeof(height));
}
// the 4th field, Tensor data
SerializeToStream(os, selected_rows.value(), dev_ctx);
TensorToStream(os, selected_rows.value(), dev_ctx);
}
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
......@@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
selected_rows->set_height(height);
}
// the 4th field, tensor which contains the data
DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx);
TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
}
} // namespace framework
......
......@@ -16,6 +16,76 @@
namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
}
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) {
dev_ctx = pool.Get(src.place());
} else {
dev_ctx = pool.Get(dst_place);
}
TensorCopy(src, dst_place, *dev_ctx, dst);
}
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
......@@ -69,7 +139,7 @@ struct AnyVisitor : public boost::static_visitor<bool> {
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
Copy(out, cpu, *gpuctx, &tmp);
TensorCopy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
......@@ -87,7 +157,7 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
return platform::VisitPlace(place, visitor);
}
struct HasNANPredicate {
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
......@@ -96,12 +166,12 @@ struct HasNANPredicate {
}
};
bool HasNAN(const framework::Tensor& tensor) {
HasNANPredicate predicate;
bool TensorContainsNAN(const framework::Tensor& tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
struct HasInfPredicate {
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
......@@ -110,10 +180,124 @@ struct HasInfPredicate {
}
};
bool HasInf(const framework::Tensor& tensor) {
HasInfPredicate predicate;
bool TensorContainsInf(const framework::Tensor& tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::VarType::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
struct DeserializedDataFunctor {
DeserializedDataFunctor(void** buf, Tensor* tensor,
const platform::Place& place)
: buf_(buf), tensor_(tensor), place_(place) {}
template <typename T>
void operator()() {
*buf_ = tensor_->mutable_data<T>(place_);
}
void** buf_;
Tensor* tensor_;
platform::Place place_;
};
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::VarType::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
}
} // namespace framework
} // namespace paddle
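TensorToStream writes a uint32 version word, then the serialized proto::VarType::TensorDesc preceded by its int32 byte size, then the raw tensor bytes (staged through a 64MB host buffer for GPU tensors); TensorFromStream reads the same fields back. A minimal CPU-only round-trip sketch (illustrative, not part of the commit; the function name is made up):

#include <sstream>
#include "paddle/fluid/framework/tensor_util.h"

void StreamRoundTripSketch() {  // hypothetical example function
  namespace fw = paddle::framework;
  fw::Tensor src, dst;
  src.Resize({2, 3});
  int* p = src.mutable_data<int>(paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) p[i] = i;

  paddle::platform::CPUDeviceContext cpu_ctx;
  std::ostringstream os;
  fw::TensorToStream(os, src, cpu_ctx);     // version + TensorDesc + raw data

  std::istringstream is(os.str());
  fw::TensorFromStream(is, &dst, cpu_ctx);  // restores dims, dtype and contents
}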
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace framework {
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void operator()() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
Copy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct HasNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool HasNAN(const framework::Tensor& tensor) {
HasNANPredicate predicate;
return Any(tensor, predicate);
}
struct HasInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool HasInf(const framework::Tensor& tensor) {
HasInfPredicate predicate;
return Any(tensor, predicate);
}
} // namespace framework
} // namespace paddle
tensor_util.cc
\ No newline at end of file
......@@ -22,106 +22,38 @@ limitations under the License. */
namespace paddle {
namespace framework {
/**
* @brief Copy the content of external tensor to a new place.
*
* @param[in] src The external tensor.
* @param[in] dst_place The dst place.
* @param[in] ctx The device context contains device resources.
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline void Copy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst);
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst);
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
template <typename T>
void TensorFromVector(const std::vector<T>& src,
const platform::DeviceContext& ctx, Tensor* dst);
template <typename T>
void TensorFromVector(const std::vector<T>& src, Tensor* dst);
auto dst_ptr = dst->mutable_data(dst_place, src.type());
template <typename T>
void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
std::vector<T>* dst);
template <typename T>
void TensorToVector(const Tensor& src, std::vector<T>* dst);
auto size = src.numel() * SizeOfType(src.type());
bool TensorContainsNAN(const framework::Tensor& tensor);
bool TensorContainsInf(const framework::Tensor& tensor);
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
}
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx);
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx);
/**
* @brief Wrapper on
* Copy(const Tensor& src, const platform::Place& dst_place,
* const platform::DeviceContext& ctx, Tensor* dst);
*
* @param[in] src The external tensor.
* @param[in] dst_place The dst place.
*
* @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
inline void Copy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(src.place())) {
dev_ctx = pool.Get(src.place());
} else {
dev_ctx = pool.Get(dst_place);
}
Copy(src, dst_place, *dev_ctx, dst);
}
//
// The implementation of template functions.
//
/**
* @brief Copy the content of an external vector to a tensor.
*
* @param[in] src The external tensor.
* @param[in] ctx The device context contains device resources.
*
* @note CopyFromVector will resize dst to a 1D tensor with the same
* size as src.
*/
template <typename T>
inline void CopyFromVector(const std::vector<T>& src,
const platform::DeviceContext& ctx, Tensor* dst) {
void TensorFromVector(const std::vector<T>& src,
const platform::DeviceContext& ctx, Tensor* dst) {
auto dst_place = ctx.GetPlace();
auto src_ptr = static_cast<const void*>(src.data());
platform::CPUPlace src_place;
......@@ -143,11 +75,8 @@ inline void CopyFromVector(const std::vector<T>& src,
#endif
}
/**
* @brief CopyFromVector CPU vector -> CPU Tensor
*/
template <typename T>
inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
void TensorFromVector(const std::vector<T>& src, Tensor* dst) {
platform::CPUPlace dst_place = platform::CPUPlace();
auto src_ptr = static_cast<const void*>(src.data());
platform::CPUPlace src_place;
......@@ -158,18 +87,9 @@ inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
/**
* @brief Copy the content of a tensor to a vector
*
* @param[in] src The external tensor.
* @param[in] ctx The device context contains device resources.
*
* @note CopyToVector assumes that the tensor has been resized
* before invoking.
*/
template <typename T>
inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
std::vector<T>* dst) {
void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
std::vector<T>* dst) {
auto src_ptr = static_cast<const void*>(src.data<T>());
auto size = src.numel() * sizeof(T);
......@@ -191,11 +111,8 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif
}
/**
* @brief CopyToVector CPUTensor <-> CPU Vector
*/
template <typename T>
inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
void TensorToVector(const Tensor& src, std::vector<T>* dst) {
auto src_ptr = static_cast<const void*>(src.data<T>());
auto size = src.numel() * sizeof(T);
......@@ -209,125 +126,5 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size);
}
// Returns true if a tensor contains NAN, i.e., Not A Number.
bool HasNAN(const framework::Tensor& tensor);
// Returns true if a tensor contains Inf, i.e., Infinity.
bool HasInf(const framework::Tensor& tensor);
inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::VarType::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
struct DeserializedDataFunctor {
DeserializedDataFunctor(void** buf, Tensor* tensor,
const platform::Place& place)
: buf_(buf), tensor_(tensor), place_(place) {}
template <typename T>
void operator()() {
*buf_ = tensor_->mutable_data<T>(place_);
}
void** buf_;
Tensor* tensor_;
platform::Place place_;
};
inline void DeserializeFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::VarType::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
auto dst_place = dev_ctx.GetPlace();
framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
}
} // namespace framework
} // namespace paddle
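The context-aware TensorCopy overload declared above still supports CPU <-> GPU and GPU <-> GPU transfers, as the removed comment block noted; the copy is issued on the CUDA context's stream, so callers wait on the context before reading the result. A hedged sketch, assuming a CUDA build (PADDLE_WITH_CUDA) and device 0; the helper name is hypothetical:

#include "paddle/fluid/framework/tensor_util.h"

// Hypothetical helper; assumes PADDLE_WITH_CUDA and GPU device 0.
void CopyCpuTensorToGpuSketch(const paddle::framework::Tensor& cpu_src,
                              paddle::framework::Tensor* gpu_dst) {
  paddle::platform::CUDAPlace gpu(0);
  paddle::platform::CUDADeviceContext gpu_ctx(gpu);
  // The device context must belong to the GPU side of the transfer
  // (see the PADDLE_ENFORCE checks inside TensorCopy).
  paddle::framework::TensorCopy(cpu_src, gpu, gpu_ctx, gpu_dst);
  gpu_ctx.Wait();  // the copy runs on gpu_ctx's stream
}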
......@@ -20,7 +20,7 @@
namespace paddle {
namespace framework {
TEST(Copy, Tensor) {
TEST(TensorCopy, Tensor) {
Tensor src_tensor;
Tensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
......@@ -33,7 +33,7 @@ TEST(Copy, Tensor) {
src_tensor.set_layout(DataLayout::kAnyLayout);
auto cpu_place = new platform::CPUPlace();
Copy(src_tensor, *cpu_place, &dst_tensor);
TensorCopy(src_tensor, *cpu_place, &dst_tensor);
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
......@@ -44,7 +44,7 @@ TEST(Copy, Tensor) {
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
Tensor slice_tensor = src_tensor.Slice(1, 2);
Copy(slice_tensor, *cpu_place, &dst_tensor);
TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
......@@ -68,11 +68,11 @@ TEST(Copy, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
auto cpu_place = new platform::CPUPlace();
Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
......@@ -85,10 +85,10 @@ TEST(Copy, Tensor) {
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
TensorCopy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
......@@ -104,7 +104,7 @@ TEST(Copy, Tensor) {
#endif
}
TEST(CopyFromVector, Tensor) {
TEST(TensorFromVector, Tensor) {
using namespace paddle::framework;
using namespace paddle::platform;
{
......@@ -114,7 +114,7 @@ TEST(CopyFromVector, Tensor) {
// Copy to CPU Tensor
cpu_tensor.Resize(make_ddim({3, 3}));
auto cpu_place = new paddle::platform::CPUPlace();
CopyFromVector<int>(src_vec, &cpu_tensor);
TensorFromVector<int>(src_vec, &cpu_tensor);
// Compare Tensors
const int* cpu_ptr = cpu_tensor.data<int>();
......@@ -126,7 +126,7 @@ TEST(CopyFromVector, Tensor) {
src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
cpu_tensor.Resize(make_ddim({2, 2}));
CopyFromVector<int>(src_vec, &cpu_tensor);
TensorFromVector<int>(src_vec, &cpu_tensor);
cpu_ptr = cpu_tensor.data<int>();
src_ptr = src_vec.data();
ASSERT_NE(src_ptr, cpu_ptr);
......@@ -148,15 +148,15 @@ TEST(CopyFromVector, Tensor) {
cpu_tensor.Resize(make_ddim({3, 3}));
auto cpu_place = new paddle::platform::CPUPlace();
CPUDeviceContext cpu_ctx(*cpu_place);
CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
// Copy to GPUTensor
gpu_tensor.Resize(make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace();
CUDADeviceContext gpu_ctx(*gpu_place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
......@@ -173,10 +173,10 @@ TEST(CopyFromVector, Tensor) {
src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
cpu_tensor.Resize(make_ddim({2, 2}));
CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
gpu_tensor.Resize(make_ddim({2, 2}));
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
......@@ -196,7 +196,7 @@ TEST(CopyFromVector, Tensor) {
#endif
}
TEST(CopyToVector, Tensor) {
TEST(TensorToVector, Tensor) {
using namespace paddle::framework;
using namespace paddle::platform;
{
......@@ -208,7 +208,7 @@ TEST(CopyToVector, Tensor) {
CPUPlace place;
std::vector<int> dst;
CopyToVector<int>(src, &dst);
TensorToVector<int>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]);
......@@ -220,10 +220,10 @@ TEST(CopyToVector, Tensor) {
Tensor gpu_tensor;
CUDAPlace place;
CUDADeviceContext gpu_ctx(place);
CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
std::vector<int> dst;
CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_vec[i], dst[i]);
......@@ -232,7 +232,7 @@ TEST(CopyToVector, Tensor) {
#endif
}
TEST(HasNAN, CPU) {
TEST(TensorContainsNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
......@@ -240,11 +240,12 @@ TEST(HasNAN, CPU) {
buf[0] = 0.0;
buf[1] = NAN;
buf[2] = 0.0;
ASSERT_TRUE(HasNAN(src));
ASSERT_TRUE(TensorContainsNAN(src));
buf[1] = 0.0;
ASSERT_FALSE(TensorContainsNAN(src));
}
TEST(HasInf, CPU) {
TEST(TensorContainsInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
......@@ -252,10 +253,12 @@ TEST(HasInf, CPU) {
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
ASSERT_TRUE(HasInf(src));
ASSERT_TRUE(TensorContainsInf(src));
buf[1] = 1.0;
ASSERT_FALSE(TensorContainsInf(src));
}
TEST(Tensor, SerializeAndDeserialize) {
TEST(Tensor, FromAndToStream) {
framework::Tensor src_tensor;
int array[6] = {1, 2, 3, 4, 5, 6};
src_tensor.Resize({2, 3});
......@@ -268,10 +271,10 @@ TEST(Tensor, SerializeAndDeserialize) {
auto place = new platform::CPUPlace();
platform::CPUDeviceContext cpu_ctx(*place);
std::ostringstream oss;
SerializeToStream(oss, src_tensor, cpu_ctx);
TensorToStream(oss, src_tensor, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
TensorFromStream(iss, &dst_tensor, cpu_ctx);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 5; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
......@@ -288,13 +291,13 @@ TEST(Tensor, SerializeAndDeserialize) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
std::ostringstream oss;
SerializeToStream(oss, gpu_tensor, gpu_ctx);
TensorToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor, gpu_ctx);
TensorFromStream(iss, &dst_tensor, gpu_ctx);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
......
......@@ -31,7 +31,7 @@ static __global__ void FillInf(float* buf) {
buf[2] = 0.5;
}
TEST(HasNAN, GPU) {
TEST(TensorContainsNAN, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
......@@ -39,10 +39,10 @@ TEST(HasNAN, GPU) {
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasNAN(tensor));
ASSERT_TRUE(TensorContainsNAN(tensor));
}
TEST(HasInf, GPU) {
TEST(TensorContainsInf, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
......@@ -50,7 +50,7 @@ TEST(HasInf, GPU) {
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasInf(tensor));
ASSERT_TRUE(TensorContainsInf(tensor));
}
} // namespace framework
......
......@@ -64,7 +64,6 @@ class ThreadPool {
Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
try {
fn();
return nullptr;
} catch (platform::EnforceNotMet ex) {
return std::unique_ptr<platform::EnforceNotMet>(
new platform::EnforceNotMet(ex));
......@@ -73,6 +72,7 @@ class ThreadPool {
<< "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet.";
}
return nullptr;
});
std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
tasks_.push(std::move(task));
......
......@@ -42,7 +42,7 @@ class ArrayOp : public framework::OperatorBase {
if (platform::is_gpu_place(i_tensor.place())) {
// FIXME: Avoid copy from GPU to CPU
framework::Tensor t;
framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
dev_ctx.Wait();
offset = static_cast<size_t>(*t.data<int64_t>());
} else {
......
......@@ -112,8 +112,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Copy(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice);
framework::TensorCopy(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice);
out_offset += len;
}
}
......
......@@ -45,7 +45,7 @@ class AssignFunctor {
out_rows.set_height(rows.height());
auto &t = rows.value();
auto *m = out_rows.mutable_value();
framework::Copy(t, t.place(), dev_ctx_, m);
framework::TensorCopy(t, t.place(), dev_ctx_, m);
}
template <typename T>
......@@ -57,7 +57,7 @@ class AssignFunctor {
void copy_tensor(const framework::LoDTensor &lod_tensor,
framework::LoDTensor *out) const {
auto &out_tensor = *out;
Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
out_tensor.set_lod(lod_tensor.lod());
}
......
......@@ -41,7 +41,7 @@ class AssignValueKernel : public framework::OpKernel<T> {
break;
}
auto values = ctx.Attr<std::vector<T>>(value_name);
framework::CopyFromVector(values, ctx.device_context(), out);
framework::TensorFromVector(values, ctx.device_context(), out);
out->Resize(framework::make_ddim(shape));
}
};
......
......@@ -232,12 +232,12 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
id_tensor->set_lod(lod);
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
framework::CopyFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
framework::TensorFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
score_tensor->set_lod(lod);
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
framework::CopyFromVector<T>(score_data, cpu_ctx, score_tensor);
framework::TensorFromVector<T>(score_data, cpu_ctx, score_tensor);
}
template <typename T>
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
......@@ -98,16 +98,16 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
T* conf_data = conf_tensor.data<T>();
if (platform::is_gpu_place(context.GetPlace())) {
loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
framework::Copy(loc_tensor, platform::CPUPlace(),
context.device_context(), &loc_cpu);
framework::TensorCopy(loc_tensor, platform::CPUPlace(),
context.device_context(), &loc_cpu);
loc_data = loc_cpu.data<T>();
conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
framework::Copy(conf_tensor, platform::CPUPlace(),
context.device_context(), &conf_cpu);
framework::TensorCopy(conf_tensor, platform::CPUPlace(),
context.device_context(), &conf_cpu);
conf_data = conf_cpu.data<T>();
priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
framework::Copy(*in_priorbox, platform::CPUPlace(),
context.device_context(), &priorbox_cpu);
framework::TensorCopy(*in_priorbox, platform::CPUPlace(),
context.device_context(), &priorbox_cpu);
priorbox_data = priorbox_cpu.data<T>();
}
// get decode bboxes
......@@ -158,8 +158,8 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
batch_size, all_indices, all_decoded_bboxes,
out_data);
if (platform::is_gpu_place(context.GetPlace())) {
framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(),
out);
framework::TensorCopy(out_cpu, platform::CUDAPlace(),
context.device_context(), out);
}
}
};
......
......@@ -126,7 +126,8 @@ class ExpandGradKernel : public framework::OpKernel<T> {
auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
out0->mutable_data<T>(context.GetPlace());
framework::Copy(*in0, context.GetPlace(), context.device_context(), out0);
framework::TensorCopy(*in0, context.GetPlace(), context.device_context(),
out0);
} else {
switch (dims) {
REP_EXPAND_GRAD_TEMPLATE(72)
......
......@@ -57,7 +57,7 @@ class FeedOp : public framework::OperatorBase {
if (platform::is_same_place(feed_item.place(), place)) {
out_item->ShareDataWith(feed_item);
} else {
framework::Copy(feed_item, place, dev_ctx, out_item);
framework::TensorCopy(feed_item, place, dev_ctx, out_item);
}
out_item->set_lod(feed_item.lod());
}
......
......@@ -56,7 +56,7 @@ class FetchOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(src_item.place());
Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait();
dst_item.set_lod(src_item.lod());
......
......@@ -74,7 +74,7 @@ class FillOp : public framework::OperatorBase {
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Copy(tensor, place, dev_ctx, &out);
framework::TensorCopy(tensor, place, dev_ctx, &out);
}
}
};
......
......@@ -196,7 +196,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
// dy_dx
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(
ctx, &d_y, scale, /*axis*/ 1, MulFunctor<T>(), &temp);
framework::Copy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, temp, &temp_vec);
......@@ -208,7 +208,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor<T>(), &temp);
} else {
// dy_dx
framework::Copy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x);
// dy_dmean_dx
row_mean(dev_ctx, d_y, &temp_vec);
......
......@@ -69,7 +69,7 @@ class LoadCombineOp : public framework::OperatorBase {
out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(cpu_tensor.lod());
Copy(cpu_tensor, place, dev_ctx, tensor);
TensorCopy(cpu_tensor, place, dev_ctx, tensor);
}
}
}
......
......@@ -55,7 +55,7 @@ class LoadOp : public framework::OperatorBase {
out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(cpu_tensor.lod());
Copy(cpu_tensor, place, dev_ctx, tensor);
TensorCopy(cpu_tensor, place, dev_ctx, tensor);
}
}
};
......
......@@ -33,8 +33,8 @@ class LoDResetKernel : public framework::OpKernel<T> {
auto* lod = lod_t->data<int>();
if (platform::is_gpu_place(ctx.GetPlace())) {
framework::Tensor lod_cpu;
framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(),
&lod_cpu);
framework::TensorCopy(*lod_t, platform::CPUPlace(),
ctx.device_context(), &lod_cpu);
lod = lod_cpu.data<int>();
}
level0 = std::vector<int>(lod, lod + lod_t->numel());
......
......@@ -94,9 +94,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Copy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice);
framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice);
offset += len;
}
}
......
......@@ -149,7 +149,8 @@ class ContextProjectFunctor {
Tensor out_t_sub = out_t.Slice(k * context_length,
k * context_length + padding_size);
Tensor w_sub = padding_data.Slice(k, k + padding_size);
framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
framework::TensorCopy(w_sub, context.GetPlace(), context,
&out_t_sub);
}
}
if (down_pad > 0) { // add down pad
......@@ -179,7 +180,8 @@ class ContextProjectFunctor {
(down_pad_begin_row + t) * context_length);
Tensor w_sub = padding_data.Slice(
up_pad + padding_idx, up_pad + padding_idx + padding_size);
framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
framework::TensorCopy(w_sub, context.GetPlace(), context,
&out_t_sub);
}
}
out_t.Resize({sequence_height, context_length * sequence_width});
......
......@@ -62,7 +62,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
Copy(input_tmp, *place, *context, &input);
TensorCopy(input_tmp, *place, *context, &input);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
......@@ -87,7 +87,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......@@ -98,7 +98,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
out_ocf_ptr = output_tmp.data<float>();
}
......@@ -119,7 +119,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
Copy(input_tmp, *place, *context, &input);
TensorCopy(input_tmp, *place, *context, &input);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
......@@ -128,7 +128,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......@@ -140,7 +140,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
Copy(input_tmp, *place, *context, &input);
TensorCopy(input_tmp, *place, *context, &input);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
......@@ -148,7 +148,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......
......@@ -29,15 +29,15 @@ TEST(math_function, notrans_mul_trans) {
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -63,15 +63,15 @@ TEST(math_function, trans_mul_notrans) {
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -112,9 +112,9 @@ TEST(math_function, gemm_notrans_cublas) {
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
......@@ -122,7 +122,7 @@ TEST(math_function, gemm_notrans_cublas) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......@@ -167,9 +167,9 @@ TEST(math_function, gemm_trans_cublas) {
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
......@@ -177,7 +177,7 @@ TEST(math_function, gemm_trans_cublas) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);
......@@ -218,15 +218,15 @@ void GemvTest(int m, int n, bool trans) {
}
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a);
paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b);
paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a);
paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);
paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c);
paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context,
&vec_c);
paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context,
&vec_c);
if (!trans) {
for (int i = 0; i < m; ++i) {
......
......@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu;
Copy(*out_value, cpu_place, ctx, &out_cpu);
TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>();
......@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu;
Copy(*tensor2, cpu_place, ctx, &tensor2_cpu);
TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu.data<float>();
......@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu;
Copy(*out_value, cpu_place, ctx, &out_cpu);
TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>();
......@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) {
add_to_tensor_functor(ctx, *output, tensor1.get());
Tensor tensor1_cpu;
Copy(*tensor1, cpu_place, ctx, &tensor1_cpu);
TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
ctx.Wait();
auto* tensor1_cpu_data = tensor1_cpu.data<float>();
......
......@@ -97,7 +97,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
Copy(seq, context.GetPlace(), context, &padding);
TensorCopy(seq, context.GetPlace(), context, &padding);
padding.Resize(padding_dims);
return;
}
......@@ -172,7 +172,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
Copy(padding, context.GetPlace(), context, &seq);
TensorCopy(padding, context.GetPlace(), context, &seq);
seq.Resize(seq_dims);
return;
}
......
......@@ -40,7 +40,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
if (paddle::platform::is_cpu_place(*place)) {
seq = cpu_seq;
} else {
Copy(cpu_seq, *place, *context, &seq);
TensorCopy(cpu_seq, *place, *context, &seq);
seq.set_lod(lod);
}
......@@ -63,7 +63,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
if (paddle::platform::is_cpu_place(*place)) {
cpu_seq_back = seq_back;
} else {
Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
cpu_seq_back.set_lod(lod);
}
......
......@@ -71,7 +71,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
Copy(input_tmp, *place, *context, &input);
paddle::framework::TensorCopy(input_tmp, *place, *context, &input);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
......@@ -85,7 +85,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
out_cfo_ptr = output_tmp.data<float>();
}
......@@ -99,7 +99,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
Copy(input_tmp, *place, *context, &input);
TensorCopy(input_tmp, *place, *context, &input);
}
paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
......@@ -109,7 +109,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
in_ptr = input_tmp.data<float>();
}
......
......@@ -51,7 +51,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
cpu_mask->ShareDataWith(mask);
} else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
cpu_mask.get());
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
......@@ -106,8 +107,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
continue;
}
auto slice = out->Slice(out_offset, out_offset + len);
framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
&slice);
framework::TensorCopy(input->Slice(start_offset, end_offset), place,
dev_ctx, &slice);
out_offset += len;
(*in_idx) += 1;
}
......
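Note (illustrative, not part of the patch): the MergeLoDTensorOp hunk copies a slice of an input tensor into a slice of the output. A hedged sketch of that slice-to-slice copy, assuming the same headers as the earlier sketch and that `out` is already sized to hold all rows.

// Copy rows [start, end) of `in` into `out` starting at row `*out_offset`.
// Slice() returns a view over the same allocation, so copying into the slice
// writes directly into `out`.
void CopyRowRange(const paddle::framework::Tensor& in, int start, int end,
                  const paddle::platform::Place& place,
                  const paddle::platform::DeviceContext& dev_ctx,
                  paddle::framework::Tensor* out, int* out_offset) {
  int len = end - start;
  auto dst = out->Slice(*out_offset, *out_offset + len);
  paddle::framework::TensorCopy(in.Slice(start, end), place, dev_ctx, &dst);
  *out_offset += len;
}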
......@@ -67,7 +67,8 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
auto out_match_indices =
ctx.Output<framework::Tensor>("UpdatedMatchIndices");
framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
framework::TensorCopy(*in_matched_indices, ctx.GetPlace(),
out_match_indices);
int batch_size = in_matched_indices->dims()[0];
int prior_num = in_matched_indices->dims()[1];
......
......@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
......@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.cuda_device_context().stream();
......
......@@ -98,7 +98,7 @@ class NCCLTester : public ::testing::Test {
send_tensor->mutable_data<T>(kDims, place);
std::vector<T> send_vector(f::product(kDims), gpu_id);
paddle::framework::CopyFromVector<T>(send_vector, *ctx, send_tensor);
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
}
......
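Note (illustrative, not part of the patch): the NCCL test hunk renames CopyFromVector to TensorFromVector. A minimal sketch of filling a device tensor from host data with the renamed helper; the dims, element type, and fill value are assumptions, and the headers from the earlier sketch are assumed.

#include <vector>

// Populate a tensor that lives on `place` from values prepared in a std::vector.
void FillFromHost(const paddle::platform::Place& place,
                  const paddle::platform::DeviceContext& ctx,
                  paddle::framework::Tensor* t) {
  std::vector<float> host_values(16, 1.0f);
  t->mutable_data<float>(paddle::framework::make_ddim({4, 4}), place);
  paddle::framework::TensorFromVector<float>(host_values, ctx, t);
  ctx.Wait();  // as in the test: make sure the transfer finished before use
}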
......@@ -78,7 +78,7 @@ inline void CopyOrShare(const framework::Variable &src,
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
} else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
......@@ -88,7 +88,7 @@ inline void CopyOrShare(const framework::Variable &src,
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
dst_sr->set_rows(src_sr.rows());
} else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
......@@ -146,7 +146,7 @@ class ParallelDoOp : public framework::OperatorBase {
auto &place = places[i];
auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
framework::Copy(src, place, dst);
framework::TensorCopy(src, place, dst);
}
}
WaitOnPlaces(places);
......
......@@ -179,7 +179,7 @@ class TensorPrintOp : public framework::OperatorBase {
} else {
// copy data to cpu to print
platform::CPUPlace place;
framework::Copy(in_tensor, place, &printed_tensor);
framework::TensorCopy(in_tensor, place, &printed_tensor);
}
Formater formater;
......
......@@ -291,7 +291,7 @@ class RecurrentOp : public RecurrentBase {
auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
// Explicit copy output since the local RNN scope can be destroyed
// early.
framework::Copy(src_tensor, place, dev_ctx, &dst_out);
framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
});
scopes.Next();
......@@ -378,7 +378,7 @@ class RecurrentGradOp : public RecurrentBase {
auto *cur_grad_var = cur_scope.Var(cur_grad);
auto cur_grad_tensor =
cur_grad_var->GetMutable<framework::LoDTensor>();
framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor);
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
}
}
......@@ -452,7 +452,7 @@ class RecurrentGradOp : public RecurrentBase {
}
auto dst = outside->Slice(seq_offset, seq_offset + 1);
framework::Copy(inside, place, dev_ctx, &dst);
framework::TensorCopy(inside, place, dev_ctx, &dst);
});
VLOG(5) << "Link outside gradient finished ";
......@@ -465,7 +465,7 @@ class RecurrentGradOp : public RecurrentBase {
framework::LoDTensor *outside) {
outside->Resize(inside.dims());
outside->mutable_data(place, inside.type());
framework::Copy(inside, place, dev_ctx, outside);
framework::TensorCopy(inside, place, dev_ctx, outside);
});
VLOG(5) << "Link initialize state gradient finished ";
}
......
......@@ -170,7 +170,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
framework::TensorCopy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
out_offset += len;
return out_offset;
}
......
......@@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>("X");
auto out_dims = out->dims();
out->mutable_data<T>(ctx.GetPlace());
framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out);
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
out->Resize(out_dims);
}
};
......@@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
d_x->mutable_data<T>(ctx.GetPlace());
auto in_dims = d_x->dims();
framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
d_x->Resize(in_dims);
}
};
......
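Note (illustrative, not part of the patch): the reshape hunks use a copy-then-Resize idiom — the buffer contents are unchanged and only the shape metadata is rewritten after the copy. A hedged sketch, assuming the same headers as the earlier sketch and that `new_dims` describes the same number of elements as `in`.

// Reshape by copying: TensorCopy allocates `out` on `place` with in.dims();
// Resize() afterwards only rewrites the dims metadata, leaving the data as-is.
void CopyAndReshape(const paddle::framework::Tensor& in,
                    const paddle::framework::DDim& new_dims,
                    const paddle::platform::Place& place,
                    const paddle::platform::DeviceContext& dev_ctx,
                    paddle::framework::Tensor* out) {
  paddle::framework::TensorCopy(in, place, dev_ctx, out);
  out->Resize(new_dims);
}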
......@@ -61,7 +61,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
}
}
framework::Copy(*in, context.GetPlace(), out);
framework::TensorCopy(*in, context.GetPlace(), out);
out->Resize({static_cast<int64_t>(out->lod()[0].back()), out_width});
}
};
......@@ -77,7 +77,7 @@ class SequenceReshapeGradKernel : public framework::OpKernel<T> {
context.Output<LoDTensor>(framework::GradVarName("X"));
xg_tensor_ptr->mutable_data<T>(context.GetPlace());
framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
framework::TensorCopy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
xg_tensor_ptr->Resize(x_tensor_ptr->dims());
}
};
......
......@@ -66,13 +66,13 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
if (platform::is_gpu_place(ctx.GetPlace())) {
offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
&offset_cpu);
framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
&offset_cpu);
offset_data = offset_cpu.data<int64_t>();
length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
&length_cpu);
framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
&length_cpu);
length_data = length_cpu.data<int64_t>();
}
......@@ -127,13 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
if (platform::is_gpu_place(ctx.GetPlace())) {
offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
&offset_cpu);
framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
&offset_cpu);
offset_data = offset_cpu.data<int64_t>();
length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
&length_cpu);
framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
&length_cpu);
length_data = length_cpu.data<int64_t>();
}
......
......@@ -133,7 +133,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
auto height = dout_tensor.dims()[0];
auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
framework::TensorCopy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
......
......@@ -55,7 +55,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
cpu_mask->ShareDataWith(mask);
} else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
cpu_mask.get());
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
......@@ -113,9 +114,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
// out[offset: offset+len] = x[each_range.begin: each_range.end]
auto slice = out->Slice(static_cast<int>(offset),
static_cast<int>(offset + len));
framework::Copy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice);
framework::TensorCopy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice);
offset += len;
}
}
......
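Note (illustrative, not part of the patch): both SplitLoDTensorOp and MergeLoDTensorOp bring the mask to the CPU before reading it, guarded by PADDLE_WITH_CUDA when the mask lives on the GPU. A hedged sketch that consolidates the fragments shown in the hunks above, assuming the same headers plus <memory> and the enforce macros.

// Return a CPU-resident copy (or shared view) of `mask` so its values can be
// read on the host.
std::unique_ptr<paddle::framework::LoDTensor> GetCpuMask(
    const paddle::framework::LoDTensor& mask,
    const paddle::platform::DeviceContext& dev_ctx) {
  std::unique_ptr<paddle::framework::LoDTensor> cpu_mask(
      new paddle::framework::LoDTensor());
  if (paddle::platform::is_cpu_place(mask.place())) {
    cpu_mask->ShareDataWith(mask);  // already on the host, no copy needed
  } else if (paddle::platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
    paddle::framework::TensorCopy(mask, paddle::platform::CPUPlace(), dev_ctx,
                                  cpu_mask.get());
#else
    PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
  }
  return cpu_mask;
}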
......@@ -137,8 +137,8 @@ class SumKernel : public framework::OpKernel<T> {
out_array.resize(i + 1);
}
if (out_array[i].numel() == 0) {
framework::Copy(in_array[i], in_array[i].place(),
context.device_context(), &out_array[i]);
framework::TensorCopy(in_array[i], in_array[i].place(),
context.device_context(), &out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
......
......@@ -45,7 +45,7 @@ class WriteToArrayOp : public ArrayOp {
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
Copy(x_tensor, place, dev_ctx, out_tensor);
TensorCopy(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod());
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
......@@ -138,7 +138,7 @@ class ReadFromArrayOp : public ArrayOp {
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Copy(x_array[offset], place, dev_ctx, out_tensor);
framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod());
} else {
VLOG(10) << "offset " << offset << " >= " << x_array.size();
......
......@@ -185,7 +185,8 @@ class WarpCTCKernel : public framework::OpKernel<T> {
// warpctc accesses labels in CPU memory
Tensor warpctc_label;
Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
&warpctc_label);
const int* warpctc_label_data = warpctc_label.data<int>();
// warpctc stores loss in CPU memory
Tensor warpctc_loss;
......@@ -200,7 +201,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
sequence_width, num_sequences, blank, warpctc_loss_data);
// Copy the loss back
Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
TensorCopy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
}
};
......
......@@ -101,7 +101,7 @@ T TensorGetElement(framework::Tensor &self, size_t offset) {
return self.data<T>()[offset];
} else {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::Copy(self, platform::CPUPlace(), dst.get());
framework::TensorCopy(self, platform::CPUPlace(), dst.get());
return dst->data<T>()[offset];
}
}
......@@ -111,9 +111,9 @@ template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
if (platform::is_gpu_place(self.place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::Copy(self, platform::CPUPlace(), dst.get());
framework::TensorCopy(self, platform::CPUPlace(), dst.get());
dst->data<T>()[offset] = elem;
framework::Copy(*dst.get(), self.place(), &self);
framework::TensorCopy(*dst.get(), self.place(), &self);
} else if (platform::is_cpu_place(self.place())) {
self.data<T>()[offset] = elem;
......
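Note (illustrative, not part of the patch): the pybind hunks implement single-element access for tensors that may live on the GPU by round-tripping through a CPU copy, using the TensorCopy overload that takes no explicit DeviceContext. A hedged sketch of the write path; the template wrapper and its name are assumptions.

// Read-modify-write of one element of a tensor that may live on the GPU:
// stage a host copy, edit it there, then copy the whole tensor back.
template <typename T>
void SetElementViaHostCopy(paddle::framework::Tensor* self, size_t offset,
                           T value) {
  if (paddle::platform::is_gpu_place(self->place())) {
    paddle::framework::Tensor host_copy;
    paddle::framework::TensorCopy(*self, paddle::platform::CPUPlace(),
                                  &host_copy);
    host_copy.data<T>()[offset] = value;
    paddle::framework::TensorCopy(host_copy, self->place(), self);
  } else if (paddle::platform::is_cpu_place(self->place())) {
    self->data<T>()[offset] = value;
  }
}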