/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include #include #include namespace paddle { namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst) { VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_cpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, we should make ctx wait. // If ctx_place and src_place are the same, we should add ctx.Wait() // after memory::Copy; if ctx_place and dst_place are the same, we should // add ctx.Wait() before memory::Copy. if (platform::is_same_place(ctx_place, src_place)) { memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); ctx.Wait(); } else if (platform::is_same_place(ctx_place, dst_place)) { ctx.Wait(); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); } } } #endif } void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { // NOTE(zcd): If the src.place() and dst_place are two different GPU, // the copy operation is carried out on the dst_place's stream. This is // very important, because TensorCopy is an async operator, and in most // case, once this copy operator returns, dst is to be used in dst_place's // stream, if this copy operation is carried out on the src_place's stream, // when dst is used in dst_place's stream the copy operation may be // not completed. platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_cpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); } #endif } template struct AnyDTypeVisitor { Predicate predicate_; const Tensor& tensor_; const DevCtx& ctx_; Tensor* out_; AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, Tensor* out) : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} template void operator()() const { auto t = EigenVector::Flatten(tensor_); auto o = EigenScalar::From(*out_); // return any of predicate_(t) is true. o.device(*ctx_.eigen_device()) = predicate_(t).any(); } }; template inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, const DevCtx& ctx, framework::Tensor* out) { VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( predicate, tensor, ctx, out)); } template struct AnyVisitor : public boost::static_visitor { const framework::Tensor& tensor_; Predicate predicate_; AnyVisitor(const framework::Tensor& tensor, Predicate predicate) : tensor_(tensor), predicate_(std::move(predicate)) {} template bool operator()(const Place& place) const { framework::Tensor out; out.Resize({1}); out.mutable_data(place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); AnyImpl(predicate_, tensor_, *ctx, &out); return this->GetResult(out, place); } bool GetResult(const framework::Tensor& out, const platform::CUDAPlace& gpu) const { platform::CPUPlace cpu; framework::Tensor tmp; tmp.Resize({1}); tmp.mutable_data(cpu); auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); gpuctx->Wait(); TensorCopy(out, cpu, *gpuctx, &tmp); gpuctx->Wait(); return GetResult(tmp, cpu); } bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); } bool GetResult(const framework::Tensor& out, const platform::CUDAPinnedPlace& cpu) const { return *out.data(); } }; template inline bool Any(const framework::Tensor& tensor, Predicate predicate) { AnyVisitor visitor(tensor, predicate); auto place = tensor.place(); return platform::VisitPlace(place, visitor); } struct ContainsNANPredicate { template auto operator()(const T& eigen_vec) const -> decltype(std::declval().isnan()) { // Cast eigen_vector to vector of bool. true if is inf. return eigen_vec.isnan(); } }; bool TensorContainsNAN(const framework::Tensor& tensor) { ContainsNANPredicate predicate; return Any(tensor, predicate); } struct ContainsInfPredicate { template auto operator()(const T& eigen_vec) const -> decltype(std::declval().isinf()) { // Cast eigen_vector to vector of bool. true if is inf. return eigen_vec.isinf(); } }; bool TensorContainsInf(const framework::Tensor& tensor) { ContainsInfPredicate predicate; return Any(tensor, predicate); } void TensorToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version constexpr uint32_t version = 0; os.write(reinterpret_cast(&version), sizeof(version)); } { // the 2nd field, tensor description // int32_t size // void* protobuf message proto::VarType::TensorDesc desc; desc.set_data_type(framework::ToDataType(tensor.type())); auto dims = framework::vectorize(tensor.dims()); auto* pb_dims = desc.mutable_dims(); pb_dims->Resize(static_cast(dims.size()), 0); std::copy(dims.begin(), dims.end(), pb_dims->begin()); int32_t size = desc.ByteSize(); os.write(reinterpret_cast(&size), sizeof(size)); auto out = desc.SerializeAsString(); os.write(out.data(), size); } { // the 3rd field, tensor data uint64_t size = tensor.memory_size(); auto* data_ptr = tensor.data(); PADDLE_ENFORCE(size < std::numeric_limits::max(), "Index overflow when writing tensor"); if (platform::is_gpu_place(tensor.place())) { #ifdef PADDLE_WITH_CUDA constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); platform::CPUPlace cpu; uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); memory::Copy(cpu, buf.get(), boost::get(tensor.place()), reinterpret_cast(data), size_to_write, gpu_dev_ctx.stream()); gpu_dev_ctx.Wait(); os.write(buf.get(), size_to_write); data += size_to_write; size -= size_to_write; } #else PADDLE_THROW("Unexpected branch"); #endif } else { os.write(static_cast(data_ptr), static_cast(size)); } } } struct DeserializedDataFunctor { DeserializedDataFunctor(void** buf, Tensor* tensor, const platform::Place& place) : buf_(buf), tensor_(tensor), place_(place) {} template void operator()() { *buf_ = tensor_->mutable_data(place_); } void** buf_; Tensor* tensor_; platform::Place place_; }; void TensorFromStream(std::istream& is, Tensor* tensor, const platform::DeviceContext& dev_ctx) { uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); proto::VarType::TensorDesc desc; { // int32_t size // proto buffer int32_t size; is.read(reinterpret_cast(&size), sizeof(size)); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), "Cannot parse tensor desc"); } { // read tensor std::vector dims; dims.reserve(static_cast(desc.dims().size())); std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); is.read(static_cast(buf), cpu_tensor.memory_size()); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); #else PADDLE_THROW("Unexpected branch"); #endif } else { framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); is.read(static_cast(buf), tensor->memory_size()); } } } } // namespace framework } // namespace paddle