From 9da7b33515f760965990d4dd736b90e7de20ce58 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Thu, 1 Nov 2018 22:52:25 +0800
Subject: [PATCH] details

---
 paddle/fluid/framework/data_type_transform.cu | 107 +-----
 paddle/fluid/framework/tensor_util.cu         | 363 +-----
 2 files changed, 2 insertions(+), 468 deletions(-)

diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu
index d79f8cacb5f..f46491293ef 120000
--- a/paddle/fluid/framework/data_type_transform.cu
+++ b/paddle/fluid/framework/data_type_transform.cu
@@ -1,106 +1 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/data_type_transform.h"
-
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename InType, typename OutType>
-struct CastDataTypeFunctor {
-  HOSTDEVICE inline OutType operator()(InType in) const {
-    return static_cast<OutType>(in);
-  }
-};
-
-template <typename InType>
-struct CastDataType {
-  CastDataType(const framework::Tensor& in, framework::Tensor* out,
-               const platform::DeviceContext* ctx)
-      : in_(in), out_(out), ctx_(ctx) {}
-  const framework::Tensor in_;
-  framework::Tensor* out_;
-  const platform::DeviceContext* ctx_;
-
-  template <typename OutType>
-  void apply() {
-    auto* in_begin = in_.data<InType>();
-    auto* in_end = in_begin + in_.numel();
-    auto* out_begin = out_->mutable_data<OutType>(in_.place());
-
-    if (platform::is_cpu_place(in_.place())) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
-    } else if (platform::is_gpu_place(in_.place())) {
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
-      trans(*context, in_begin, in_end, out_begin,
-            CastDataTypeFunctor<InType, OutType>());
-      context->Wait();
-#endif
-    } else {
-      PADDLE_THROW("Unsupported place!");
-    }
-  }
-};
-
-void TransDataType(const OpKernelType& kernel_type_for_var,
-                   const OpKernelType& expected_kernel_type, const Tensor& in,
-                   Tensor* out) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  out->Resize(in.dims());
-  auto src_type = kernel_type_for_var.data_type_;
-  auto dst_type = expected_kernel_type.data_type_;
-  auto ctx = pool.Get(in.place());
-
-  switch (src_type) {
-    case proto::VarType::FP16:
-      framework::VisitDataType(dst_type,
-                               CastDataType<platform::float16>(in, out, ctx));
-      break;
-    case proto::VarType::FP32:
-      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
-      break;
-    case proto::VarType::FP64:
-      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
-      break;
-    case proto::VarType::INT32:
-      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
-      break;
-    case proto::VarType::INT64:
-      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
-      break;
-    case proto::VarType::BOOL:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
-      break;
-    case proto::VarType::INT16:
-      framework::VisitDataType(dst_type,
-                               CastDataType<int16_t>(in, out, ctx));
-      break;
-    case proto::VarType::UINT8:
-      framework::VisitDataType(dst_type, CastDataType<uint8_t>(in, out, ctx));
-      break;
-    default:
-      PADDLE_THROW("Not support type %d", src_type);
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
+data_type_transform.cc
\ No newline at end of file
diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu
index 05c4a17a01c..edd88c4e547 120000
--- a/paddle/fluid/framework/tensor_util.cu
+++ b/paddle/fluid/framework/tensor_util.cu
@@ -1,362 +1 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License. */
-#include "paddle/fluid/framework/tensor_util.h"
-#include <algorithm>
-#include <limits>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-
-namespace paddle {
-namespace framework {
-
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
-  src.check_memory_size();
-
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-
-  auto size = src.numel() * SizeOfType(src.type());
-
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    if (platform::is_same_place(src_place, dst_place)) {
-      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                   stream);
-    } else {
-      if (platform::is_same_place(ctx_place, src_place)) {
-        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                     stream);
-        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
-      } else if (platform::is_same_place(ctx_place, dst_place)) {
-        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
-        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-                     stream);
-      } else {
-        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
-      }
-    }
-  }
-#endif
-}
-
-void TensorCopy(const Tensor& src, const platform::Place& dst_place,
-                Tensor* dst) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(dst_place)) {
-    dev_ctx = pool.Get(dst_place);
-  } else {
-    dev_ctx = pool.Get(src.place());
-  }
-  TensorCopy(src, dst_place, *dev_ctx, dst);
-}
-
-void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
-                    Tensor* dst) {
-  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-          << " to " << dst_place;
-  src.check_memory_size();
-  dst->Resize(src.dims());
-  dst->set_layout(src.layout());
-  auto src_place = src.place();
-  auto src_ptr = src.data<void>();
-  auto dst_ptr = dst->mutable_data(dst_place, src.type());
-  auto size = src.numel() * SizeOfType(src.type());
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
-  }
-#ifdef PADDLE_WITH_CUDA
-  else if (platform::is_gpu_place(src_place) &&  // NOLINT
-           platform::is_cpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
-    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
-  } else if (platform::is_gpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
-    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
-    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
-  }
-#endif
-}
-
-template <typename Predicate, typename DevCtx>
-struct AnyDTypeVisitor {
-  Predicate predicate_;
-  const Tensor& tensor_;
-  const DevCtx& ctx_;
-  Tensor* out_;
-
-  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
-                  Tensor* out)
-      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
-
-  template <typename T>
-  void apply() const {
-    auto t = EigenVector<T>::Flatten(tensor_);
-    auto o = EigenScalar<bool>::From(*out_);
-    // return any of predicate_(t) is true.
-    o.device(*ctx_.eigen_device()) = predicate_(t).any();
-  }
-};
-
-template <typename Predicate, typename DevCtx>
-inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
-                    const DevCtx& ctx, framework::Tensor* out) {
-  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
-                                               predicate, tensor, ctx, out));
-}
-
-template <typename Predicate>
-struct AnyVisitor : public boost::static_visitor<bool> {
-  const framework::Tensor& tensor_;
-  Predicate predicate_;
-
-  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
-      : tensor_(tensor), predicate_(std::move(predicate)) {}
-
-  template <typename Place>
-  bool operator()(const Place& place) const {
-    framework::Tensor out;
-    out.Resize({1});
-    out.mutable_data<bool>(place);
-    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
-    AnyImpl(predicate_, tensor_, *ctx, &out);
-    return this->GetResult(out, place);
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CUDAPlace& gpu) const {
-    platform::CPUPlace cpu;
-    framework::Tensor tmp;
-    tmp.Resize({1});
-    tmp.mutable_data<bool>(cpu);
-    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
-    gpuctx->Wait();
-    TensorCopy(out, cpu, *gpuctx, &tmp);
-    gpuctx->Wait();
-    return GetResult(tmp, cpu);
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CPUPlace& cpu) const {
-    return *out.data<bool>();
-  }
-
-  bool GetResult(const framework::Tensor& out,
-                 const platform::CUDAPinnedPlace& cpu) const {
-    return *out.data<bool>();
-  }
-};
-
-template <typename Predicate>
-inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
-  AnyVisitor<Predicate> visitor(tensor, predicate);
-  auto place = tensor.place();
-  return platform::VisitPlace(place, visitor);
-}
-
-struct ContainsNANPredicate {
-  template <typename T>
-  auto operator()(const T& eigen_vec) const
-      -> decltype(std::declval<T>().isnan()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isnan();
-  }
-};
-
-bool TensorContainsNAN(const framework::Tensor& tensor) {
-  ContainsNANPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-struct ContainsInfPredicate {
-  template <typename T>
-  auto operator()(const T& eigen_vec) const
-      -> decltype(std::declval<T>().isinf()) {
-    // Cast eigen_vector to vector of bool. true if is inf.
-    return eigen_vec.isinf();
-  }
-};
-
-bool TensorContainsInf(const framework::Tensor& tensor) {
-  ContainsInfPredicate predicate;
-  return Any(tensor, predicate);
-}
-
-void TensorToStream(std::ostream& os, const Tensor& tensor,
-                    const platform::DeviceContext& dev_ctx) {
-  {  // the 1st field, uint32_t version
-    constexpr uint32_t version = 0;
-    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
-  }
-  {  // the 2nd field, tensor description
-     // int32_t  size
-     // void*    protobuf message
-    proto::VarType::TensorDesc desc;
-    desc.set_data_type(framework::ToDataType(tensor.type()));
-    auto dims = framework::vectorize(tensor.dims());
-    auto* pb_dims = desc.mutable_dims();
-    pb_dims->Resize(static_cast<int>(dims.size()), 0);
-    std::copy(dims.begin(), dims.end(), pb_dims->begin());
-    int32_t size = desc.ByteSize();
-    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
-    auto out = desc.SerializeAsString();
-    os.write(out.data(), size);
-  }
-  {  // the 3rd field, tensor data
-    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
-
-    auto* data_ptr = tensor.data<void>();
-    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
-                   "Index overflow when writing tensor");
-    if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
-      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
-      std::unique_ptr<char[]> buf(new char[kBufSize]);
-      auto& gpu_dev_ctx =
-          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
-      platform::CPUPlace cpu;
-      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
-      while (size != 0) {
-        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
-        memory::Copy(cpu, buf.get(),
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void*>(data), size_to_write,
-                     gpu_dev_ctx.stream());
-        gpu_dev_ctx.Wait();
-        os.write(buf.get(), size_to_write);
-        data += size_to_write;
-        size -= size_to_write;
-      }
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      os.write(static_cast<const char*>(data_ptr),
-               static_cast<std::streamsize>(size));
-    }
-  }
-}
-
-struct DeserializedDataFunctor {
-  DeserializedDataFunctor(void** buf, Tensor* tensor,
-                          const platform::Place& place)
-      : buf_(buf), tensor_(tensor), place_(place) {}
-
-  template <typename T>
-  void apply() {
-    *buf_ = tensor_->mutable_data<T>(place_);
-  }
-
-  void** buf_;
-  Tensor* tensor_;
-  platform::Place place_;
-};
-
-void TensorFromStream(std::istream& is, Tensor* tensor,
-                      const platform::DeviceContext& dev_ctx) {
-  uint32_t version;
-  is.read(reinterpret_cast<char*>(&version), sizeof(version));
-  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  proto::VarType::TensorDesc desc;
-  {  // int32_t size
-     // proto buffer
-    int32_t size;
-    is.read(reinterpret_cast<char*>(&size), sizeof(size));
-    std::unique_ptr<char[]> buf(new char[size]);
-    is.read(reinterpret_cast<char*>(buf.get()), size);
-    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
-                   "Cannot parse tensor desc");
-  }
-  {  // read tensor
-    std::vector<int64_t> dims;
-    dims.reserve(static_cast<size_t>(desc.dims().size()));
-    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-    tensor->Resize(framework::make_ddim(dims));
-    void* buf;
-    auto ctx = platform::CPUDeviceContext();
-    size_t size =
-        tensor->numel() *
-        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
-    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      Tensor cpu_tensor;
-      cpu_tensor.Resize(framework::make_ddim(dims));
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), size);
-      auto dst_place = dev_ctx.GetPlace();
-      framework::TensorCopy(cpu_tensor, dst_place,
-                            dev_ctx, tensor);
-#else
-      PADDLE_THROW("Unexpected branch");
-#endif
-    } else {
-      framework::VisitDataType(
-          desc.data_type(),
-          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), size);
-    }
-  }
-}
-
-}  // namespace framework
-}  // namespace paddle
+tensor_util.cc
\ No newline at end of file
--
GitLab
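
Note (editor's illustration, not part of the patch): the removed TransDataType performs a double dispatch. An outer switch over the runtime source type fixes CastDataType's InType; framework::VisitDataType then fixes OutType by calling the functor's templated apply<OutType>(). Below is a minimal standalone C++ sketch of that dispatch shape only; all names are hypothetical, not Paddle's API.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

enum class DType { FP32, FP64, INT32 };

// Map a runtime DType tag to a compile-time type by invoking the visitor's
// templated apply<T>() -- the same shape as framework::VisitDataType.
template <typename Visitor>
void VisitDType(DType t, Visitor visitor) {
  switch (t) {
    case DType::FP32:  visitor.template apply<float>(); break;
    case DType::FP64:  visitor.template apply<double>(); break;
    case DType::INT32: visitor.template apply<int32_t>(); break;
    default: throw std::runtime_error("unsupported dtype");
  }
}

// Analogue of CastDataType<InType>: the source type is fixed when the
// functor is constructed; the destination type arrives via apply<OutType>().
template <typename InType>
struct CastAndPrint {
  std::vector<InType> in;

  template <typename OutType>
  void apply() const {
    for (InType v : in) std::cout << static_cast<OutType>(v) << ' ';
    std::cout << '\n';
  }
};

int main() {
  // TransDataType's switch over src_type would fix InType at runtime too;
  // here it is fixed by hand, and VisitDType picks OutType at runtime.
  CastAndPrint<double> cast{{1.9, -2.5, 3.0}};
  VisitDType(DType::INT32, cast);  // prints: 1 -2 3
}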