diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1e36114c67091f3b3e2acfe920e54efa97abcd2c..eee746067af3614ee6ca274b3fbb7a6cfa98e500 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) - if (WIN32) - windows_symbolic(tensor_util SRCS tensor_util.cu) - nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) - add_dependencies(tensor tensor_util) - else() + # // if (WIN32) + # // windows_symbolic(tensor_util SRCS tensor_util.cu) + # // nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) + # // add_dependencies(tensor tensor_util) + # // else() nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) - endif(WIN32) + # endif(WIN32) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) endif() @@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry device_context math_function) if(WITH_GPU) - if (WIN32) - # windows treat symbolic file as a real file, which is different with unix - # We create a hidden file and compile it instead of origin source file. - windows_symbolic(hidden_file SRCS data_type_transform.cu) - nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) - add_dependencies(data_type_transform hidden_file) - else() + # if (WIN32) + # # windows treat symbolic file as a real file, which is different with unix + # # We create a hidden file and compile it instead of origin source file. + # windows_symbolic(hidden_file SRCS data_type_transform.cu) + # nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) + # add_dependencies(data_type_transform hidden_file) + # else() nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) - endif(WIN32) + # endif(WIN32) nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) else() cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu index 7dd9cb5cfd440ca05a471e4731e1fc68fafb3588..d79f8cacb5f4727defc77380371e57bcea65f068 120000 --- a/paddle/fluid/framework/data_type_transform.cu +++ b/paddle/fluid/framework/data_type_transform.cu @@ -1,15 +1,106 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -data_type_transform.cc \ No newline at end of file +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void apply() { + auto* in_begin = in_.data(); + auto* in_end = in_begin + in_.numel(); + auto* out_begin = out_->mutable_data(in_.place()); + + if (platform::is_cpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); +#ifdef __NVCC__ + } else if (platform::is_gpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + context->Wait(); +#endif + } else { + PADDLE_THROW("Unsupported place!"); + } + } +}; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + out->Resize(in.dims()); + auto src_type = kernel_type_for_var.data_type_; + auto dst_type = expected_kernel_type.data_type_; + auto ctx = pool.Get(in.place()); + + switch (src_type) { + case proto::VarType::FP16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; + case proto::VarType::FP32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::FP64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::BOOL: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT16: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::UINT8: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 03ed6da1046495ade890f1e8e0c34654883254b7..d2f729afc4870ce0fd464b849bf194a6497e3b28 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -17,8 +17,11 @@ limitations under the License. 
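// Sketch for data_type_transform.cu above: the listing drops the angle-bracket
// template arguments, so the cast functor is shown here as it is meant to
// read. Illustrative only, not part of the patch.
#include "paddle/fluid/platform/hostdevice.h"  // for HOSTDEVICE

template <typename InType, typename OutType>
struct CastDataTypeFunctor {
  HOSTDEVICE inline OutType operator()(InType in) const {
    return static_cast<OutType>(in);
  }
};

// TransDataType above switches on the *source* element type and lets
// framework::VisitDataType(dst_type, CastDataType<InType>(in, out, ctx))
// instantiate the matching destination type.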
*/ namespace paddle { namespace framework { namespace ir { - -constexpr char Node::kControlDepVarName[]; +#if !defined(_WIN32) +constexpr char Node::kControlDepVarName[] = "__control_var"; +#else +const char Node::kControlDepVarName[] = "__control_var"; +#endif int Node::count_ = 0; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d53d789d3ad27b8f9606a396264d91e5f07a9d10..6c16bfeea5ff08d5a7499349655eb4749b3c8eb9 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -27,7 +27,11 @@ namespace ir { class Node { public: enum class Type { kOperation, kVariable }; +#if !defined(_WIN32) // msvc not support constexpr correctly. static constexpr char kControlDepVarName[] = "__control_var"; +#else + static const char kControlDepVarName[]; +#endif explicit Node(const std::string& name, Type type) : name_(name), diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a5168245a6f5354833b72a5751616a2c33b56d4d..5b29f0cd4b0c75384a7d0c067728e0c919ec2dd2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, auto expected_kernel_key = this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + VLOG(3) << "expected_kernel_key: " << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu index 251c3a5e4097b7735bd9e2e9e3972bedae95d67e..05c4a17a01c6fabe48f3fe18544c13153feb0673 120000 --- a/paddle/fluid/framework/tensor_util.cu +++ b/paddle/fluid/framework/tensor_util.cu @@ -1,15 +1,362 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -tensor_util.cc \ No newline at end of file +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" + +namespace paddle { +namespace framework { + +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto stream = + reinterpret_cast(ctx).stream(); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } + } +#endif +} + +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(dst_place)) { + dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); + } + TensorCopy(src, dst_place, *dev_ctx, dst); +} + +void TensorCopySync(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() + << " to " << dst_place; + src.check_memory_size(); + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = 
src.data(); + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + auto size = src.numel() * SizeOfType(src.type()); + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } +#endif +} + +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void apply() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +struct AnyVisitor : public boost::static_visitor { + const framework::Tensor& tensor_; + Predicate predicate_; + + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + TensorCopy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPinnedPlace& cpu) const { + return *out.data(); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +struct ContainsNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. 
+ return eigen_vec.isnan(); + } +}; + +bool TensorContainsNAN(const framework::Tensor& tensor) { + ContainsNANPredicate predicate; + return Any(tensor, predicate); +} + +struct ContainsInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool TensorContainsInf(const framework::Tensor& tensor) { + ContainsInfPredicate predicate; + return Any(tensor, predicate); +} + +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::VarType::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); + + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void apply() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); + if 
(platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); + auto dst_place = dev_ctx.GetPlace(); + framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc index f7c199d0d10ccb0ac3ca5f8e407b640e3d0f406a..1d7876359b3228fca1d6d07dfe8177daaa8cd26f 100644 --- a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc +++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc @@ -1,246 +1,157 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - * This file contains a simple demo for how to take a model for inference. - */ -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include //NOLINT -#include "paddle/fluid/inference/paddle_inference_api.h" - -std::string MODELDIR = ""; /* "Directory of the inference model." */ // NOLINT -std::string REFER = ""; -/*"path to reference result for comparison."*/ //NOTLINT -/*path of data; each line is a record, format: -\t - -Please check the demo data of data.txt for details. - */ -std::string DATA = ""; -bool USE_GPU = true; /*"Whether use gpu."*/ - -auto message_err = []() -{ - std::cout << "Copyright (c) 2018 PaddlePaddle Authors." << std::endl; - std::cout << "Demo Case for windows inference. " - << "\n" - << "Usage: Input your model path and use_gpu as the guide requires," - << "then run the demo inference, and will get a result." - << std::endl; - std::cout << std::endl; -}; - -namespace paddle -{ - namespace demo - { - void split(const std::string& str, char sep, - std::vector* pieces) - { - pieces->clear(); - if (str.empty()) - { - return; - } - size_t pos = 0; - size_t next = str.find(sep, pos); - while (next != std::string::npos) - { - pieces->push_back(str.substr(pos, next - pos)); - pos = next + 1; - next = str.find(sep, pos); - } - if (!str.substr(pos).empty()) - { - pieces->push_back(str.substr(pos)); - } - } - - /* - * Get a summary of a PaddleTensor content. 
- */ - std::string SummaryTensor(const PaddleTensor& tensor) - { - std::stringstream ss; - int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); - - ss << "data[:10]\t"; - switch (tensor.dtype) - { - case PaddleDType::INT64: - for (int i = 0; i < std::min(num_elems, 10); i++) - { - ss << static_cast(tensor.data.data())[i] << " "; - } - break; - case PaddleDType::FLOAT32: - for (int i = 0; i < std::min(num_elems, 10); i++) - { - ss << static_cast(tensor.data.data())[i] << " "; - } - break; - } - return ss.str(); - } - - std::string ToString(const NativeConfig& config) - { - std::stringstream ss; - ss << "Use GPU : " << (config.use_gpu ? "True" : "False") << "\n" - << "Device : " << config.device << "\n" - << "fraction_of_gpu_memory : " << config.fraction_of_gpu_memory << "\n" - << "specify_input_name : " - << (config.specify_input_name ? "True" : "False") << "\n" - << "Program File : " << config.prog_file << "\n" - << "Param File : " << config.param_file; - return ss.str(); - } - - struct Record - { - std::vector data; - std::vector shape; - }; - - Record ProcessALine(const std::string& line) - { - std::cout << "process a line" << std::endl; - std::vector columns; - split(line, '\t', &columns); - assert(columns.size() == 2UL, "data format error, should be \t"); - - Record record; - std::vector data_strs; - split(columns[0], ' ', &data_strs); - //将数据字符串转换为整型数据并放到record.data中 - for (auto& d : data_strs) - { - record.data.push_back(std::stof(d)); - } - - std::vector shape_strs; - split(columns[1], ' ', &shape_strs); - for (auto& s : shape_strs) - { - record.shape.push_back(std::stoi(s)); - } - std::cout << "data size " << record.data.size() << std::endl; - std::cout << "data shape size " << record.shape.size() << std::endl; - return record; - } - - void CheckOutput(const std::string& referfile, const PaddleTensor& output) - { - std::string line; - std::ifstream file(referfile); - std::getline(file, line); - auto refer = ProcessALine(line); - file.close(); - - size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - std::cout << "predictor output numel " << numel << std::endl; - std::cout << "reference output numel " << refer.data.size() << std::endl; - assert(numel == refer.data.size()); - switch (output.dtype) - { - case PaddleDType::INT64: - for (size_t i = 0; i < numel; ++i) - { - assert(static_cast(output.data.data())[i] == refer.data[i]); - } - break; - case PaddleDType::FLOAT32: - for (size_t i = 0; i < numel; ++i) - { - assert(fabs(static_cast(output.data.data())[i] - refer.data[i]) <= 1e-5); - } - break; - } - } - - /* - * Use the native fluid engine to inference the demo. - */ - void Main(bool use_gpu) - { - NativeConfig config; - config.model_dir = MODELDIR; - //config.param_file = MODELDIR + "/__params__"; - //config.prog_file = MODELDIR + "/__model__"; - config.use_gpu = USE_GPU; - config.device = 0; - if (USE_GPU) - { - config.fraction_of_gpu_memory = 0.1f; // set by yourself - } - std::cout << ToString(config) << std::endl; - std::cout << "init predictor" << std::endl; - auto predictor = CreatePaddlePredictor(config); - - std::cout << "begin to process data" << std::endl; - // Just a single batch of data. - std::string line; - std::cout << "data : " << std::endl; - std::ifstream file(DATA); - if (!file.is_open()) - { - std::cout << "failed open data" << DATA << std::endl; - exit(0); - } - std::getline(file, line); - auto record = ProcessALine(line); - file.close(); - - // Inference. 
- PaddleTensor input; - input.shape = record.shape; - input.data = - PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); - input.dtype = PaddleDType::FLOAT32; - - std::cout << "run executor" << std::endl; - std::vector output; - predictor->Run({ input }, &output); - - std::cout << "output.size " << output.size() << std::endl; - auto& tensor = output.front(); - std::cout << "output: " << SummaryTensor(tensor) << std::endl; - - // compare with reference result - std::cout << "refer result : " << REFER << std::endl; - CheckOutput(REFER, tensor); - } - } -} - -int main(int argc, char** argv) -{ - MODELDIR = "./LB_icnet_model"; - //DATA = "./icnet_image.txt"; - DATA = "./1.png.txt"; - REFER = "./icnet_label.txt"; - paddle::demo::Main(USE_GPU); - - system("pause"); - return 0; -} +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +std::string DIRNAME = "./Release/infer_model"; +std::string DATA = "./test-image.txt"; +const int C = 3; // image channel +const int H = 449; // image height +const int W = 581; // image width +// 鏁版嵁鏍煎紡 +// "\t data; + std::vector shape; +}; + +NativeConfig GetConfig() { + NativeConfig config; + config.prog_file=DIRNAME + "/__model__"; + config.param_file=DIRNAME + "/__params__"; + config.fraction_of_gpu_memory = 0.0; + config.use_gpu = true; + config.device = 0; + return config; +} + +using Time = decltype(std::chrono::high_resolution_clock::now()); + +Time time() { return std::chrono::high_resolution_clock::now(); }; + +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +static void split(const std::string& str, char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +Record ProcessALine(const std::string& line) { + std::vector columns; + split(line, '\t', &columns); + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + return record; +} + +void test_naive(int batch_size){ + NativeConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + int height = H; + int width = W; + int channel = C; + int num_sum = height * width * channel * batch_size; + + // 1. 
use fake data + std::vector data; + for(int i = 0; i < num_sum; i++) { + data.push_back(0.0); + } + + PaddleTensor tensor; + tensor.shape = std::vector({batch_size, channel, height, width}); + tensor.data.Resize(sizeof(float) * batch_size * channel * height * width); + std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); + tensor.dtype = PaddleDType::FLOAT32; + + // 2. read data from file + // std::string line; + // std::ifstream file(DATA); + // std::getline(file, line); + // auto record = ProcessALine(line); + // file.close(); + // PaddleTensor tensor; + // tensor.shape = record.shape; + // tensor.data = + // PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + + std::vector paddle_tensor_feeds(1, tensor); + PaddleTensor tensor_out; + + std::vector outputs(1, tensor_out); + + predictor->Run(paddle_tensor_feeds, &outputs, batch_size); + auto time1 = time(); + + for(size_t i = 0; i < 2; i++) { + std::cout << "Pass " << i << "predict"; + predictor->Run(paddle_tensor_feeds, &outputs, batch_size); + } + + auto time2 = time(); + std::ofstream ofresult("naive_test_result.txt", std::ios::app); + + std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl; + std::cout << outputs.size() << std::endl; + +} +} // namespace paddle + +int main(int argc, char** argv) { + paddle::test_naive(1 << 0); + return 0; +} \ No newline at end of file diff --git a/paddle/fluid/inference/api/demo_ci/naive_model_test.cc b/paddle/fluid/inference/api/demo_ci/naive_model_test.cc deleted file mode 100644 index 6e6e1aa7b402533b4451f882c146f72e7130ac36..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/demo_ci/naive_model_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
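// A minimal sketch of the feed path exercised by test_naive() in
// inference_icnet.cc above, with the template arguments written out (the
// listing drops angle-bracket contents). Illustrative only; it assumes the
// 2018-era fluid NativeConfig/PaddleTensor API and reuses the model paths and
// C/H/W constants defined above.
#include <algorithm>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void RunOnce(int batch_size) {
  paddle::NativeConfig config;
  config.prog_file = "./Release/infer_model/__model__";
  config.param_file = "./Release/infer_model/__params__";
  config.fraction_of_gpu_memory = 0.0;
  config.use_gpu = true;
  config.device = 0;

  // Native (non-TensorRT) engine predictor.
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  const int C = 3, H = 449, W = 581;  // image channel/height/width, as above
  std::vector<float> data(static_cast<size_t>(batch_size) * C * H * W, 0.0f);

  paddle::PaddleTensor tensor;
  tensor.shape = {batch_size, C, H, W};
  tensor.data.Resize(data.size() * sizeof(float));
  std::copy(data.begin(), data.end(),
            static_cast<float*>(tensor.data.data()));
  tensor.dtype = paddle::PaddleDType::FLOAT32;

  std::vector<paddle::PaddleTensor> inputs(1, tensor);
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run(inputs, &outputs, batch_size);
}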
- -#include -#include -#include -#include "paddle/fluid/inference/api/paddle_inference_api.h" - -namespace paddle { - -std::string DIRNAME = "./LB_icnet_model"; -//std::string DIRNAME = "./infer_models"; -NativeConfig GetConfig() { - NativeConfig config; - config.prog_file=DIRNAME + "/__model__"; - config.param_file=DIRNAME + "/__params__"; - config.fraction_of_gpu_memory = 0.8; - config.use_gpu = true; - config.device = 0; - return config; -} - -using Time = decltype(std::chrono::high_resolution_clock::now()); -Time time() { return std::chrono::high_resolution_clock::now(); }; -double time_diff(Time t1, Time t2) { - typedef std::chrono::microseconds ms; - auto diff = t2 - t1; - ms counter = std::chrono::duration_cast(diff); - return counter.count() / 1000.0; -} - -void test_naive(int batch_size){ - NativeConfig config = GetConfig(); - // config.model_dir = model_path; - auto predictor = CreatePaddlePredictor(config); - int height = 449; - int width = 581; - //int height = 3; - //int width = 3; - int num_sum = height * width * 3 * batch_size; - - std::vector data; - - for(int i = 0; i < num_sum; i++) { - data.push_back(0.0); - } - - PaddleTensor tensor; - tensor.shape = std::vector({batch_size, 3, height, width}); - tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width); - std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); - tensor.dtype = PaddleDType::FLOAT32; - std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out; - - std::vector outputs(1, tensor_out); - - predictor->Run(paddle_tensor_feeds, &outputs, batch_size); - std::cout << "start predict123:" << std::endl; - auto time1 = time(); - - for(size_t i = 0; i < 2; i++) { - predictor->Run(paddle_tensor_feeds, &outputs, batch_size); - std::cout << "pass " << i; - } - - auto time2 = time(); - std::ofstream ofresult("naive_test_result.txt", std::ios::app); - - std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl; - std::cout << outputs.size() << std::endl; - /* - int64_t * data_o = static_cast(outputs[0].data.data()); - for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) { - ofresult << std::to_string(data_o[j]) << " "; - } - ofresult << std::endl; - ofresult.close(); - */ -} -} // namespace paddle - -int main(int argc, char** argv) { - paddle::test_naive(1 << 0); - return 0; -} \ No newline at end of file diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 22cbf680c0670552fb014043c69fcadc56863529..5bee83c9abb00e0ab097b02d7e12b74cc10d66ad 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -43,6 +43,7 @@ template class CUDNNConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(3) << "inside cudnn"; PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); auto* input = ctx.Input("Input"); @@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); T* output_data = output->mutable_data(ctx.GetPlace()); - + VLOG(3) << "get all inputs"; // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; ScopedTensorDescriptor output_desc; @@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, 
dilations); - + VLOG(3) << "create tensor descriptor"; #if CUDNN_VERSION_MIN(7, 0, 1) // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups @@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnn_conv_desc, groups)); groups = 1; #endif - + VLOG(3) << "before create tensor descriptor"; cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize2int(input->dims()), groups); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( @@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_height = output->dims()[2]; output_width = output->dims()[3]; } - + VLOG(3) << "after create tensor descriptor"; int group_offset_in = input_channels / groups * input_height * input_width * input_depth; int group_offset_out = @@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); + VLOG(3) << "set cudnn algorithm"; CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnn_conv_desc, CUDNN_DEFAULT_MATH)); } #endif - + VLOG(3) << "before get workspace"; // get workspace size able to allocate CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel { // the limit because the algo is overrided to use tensor core. PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - + VLOG(3) << "after get workspace"; // Allocate on GPU memory platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + workspace_size_in_bytes = 1024; cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + VLOG(3) << "allocate memory"; // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { @@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel { cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_output_desc, output_data + i * group_offset_out)); } + VLOG(3) << "cudnn forward"; // Release the cudnn workspace paddle::memory::Free(gpu, cudnn_workspace); + VLOG(3) << "cudnn pass"; } }; @@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Already on GPU void* cudnn_workspace = nullptr; platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); + workspace_size_in_bytes = 1024; cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 0522a94195786c767194ec727d982a60451e7c62..e2f98164be9bbf0a000c117f102dc6dc14b17657 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase { auto filename = Attr("file_path"); auto load_as_fp16 = Attr("load_as_fp16"); - std::ifstream fin(filename); - PADDLE_ENFORCE(static_cast(fin), + std::ifstream fin(filename, 
std::ios_base::in | std::ios_base::binary); + PADDLE_ENFORCE(!fin.bad(), "Cannot open file %s for load_combine op", filename); auto out_var_names = Outputs("Out"); @@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(place); for (size_t i = 0; i < out_var_names.size(); i++) { + VLOG(3) << "load " << out_var_names[i]; auto *out_var = scope.FindVar(out_var_names[i]); PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", out_var_names[i]); auto *tensor = out_var->GetMutable(); - + VLOG(3) << "Get Tensor"; // Error checking - PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", + PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s", filename); - + VLOG(3) << "before deserialization"; // Get data from fin to tensor - DeserializeFromStream(fin, tensor, dev_ctx); - + DeserializeFromStream(fin, tensor, dev_ctx); + VLOG(3) << "after deserialization"; auto in_dtype = framework::ToDataType(tensor->type()); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; @@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase { tensor->set_lod(fp16_tensor.lod()); tensor->ShareDataWith(fp16_tensor); } + VLOG(3) << "load " << out_var_names[i] << " finished"; } } }; diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index bb8b14bb9fa41942c3aa653ca224c0842fbf9a00..b6e15862c16d7b3f3a0b6a1ae8fb2312789a4d21 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) +#if !defined(_WIN32) #define CUDNN_ENFORCE(condition) \ do { \ cudnnStatus_t status = condition; \ @@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ } \ } while (false) +#else +#define CUDNN_ENFORCE(condition) +#endif enum class DataLayout { // Not use kNHWC,
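// Note on the _WIN32 branch of CUDNN_ENFORCE above: defining the macro to
// nothing means the cuDNN call passed as `condition` is never executed on
// Windows, not merely left unchecked. A minimal alternative sketch that keeps
// the call and the status check while avoiding PADDLE_THROW; it assumes
// glog's LOG(FATAL), which Paddle already uses alongside VLOG:
#define CUDNN_ENFORCE(condition)                                       \
  do {                                                                 \
    cudnnStatus_t status = condition;                                  \
    if (status != CUDNN_STATUS_SUCCESS) {                              \
      LOG(FATAL) << ::paddle::platform::cudnnGetErrorString(status);   \
    }                                                                  \
  } while (false)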