Commit e41a3fcd authored by dzhwinter

Fix the hang problem caused by updating to develop.

Parent 804dd7da
@@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  if (WIN32)
-    windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
-    add_dependencies(tensor tensor_util)
-  else()
+  # // if (WIN32)
+  # // windows_symbolic(tensor_util SRCS tensor_util.cu)
+  # // nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+  # // add_dependencies(tensor tensor_util)
+  # // else()
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
-  endif(WIN32)
+  # endif(WIN32)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
@@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry device_context math_function)
 if(WITH_GPU)
-  if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
-    # We create a hidden file and compile it instead of origin source file.
-    windows_symbolic(hidden_file SRCS data_type_transform.cu)
-    nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
-    add_dependencies(data_type_transform hidden_file)
-  else()
+  # if (WIN32)
+  # # windows treat symbolic file as a real file, which is different with unix
+  # # We create a hidden file and compile it instead of origin source file.
+  # windows_symbolic(hidden_file SRCS data_type_transform.cu)
+  # nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+  # add_dependencies(data_type_transform hidden_file)
+  # else()
   nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
-  endif(WIN32)
+  # endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
...
--- data_type_transform.cu, old version (removed): the Windows stand-in for a symlink, holding only the target file name
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
data_type_transform.cc
\ No newline at end of file

+++ data_type_transform.cu, new version (added): the full source, inlined
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace framework {
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const {
return static_cast<OutType>(in);
}
};
template <typename InType>
struct CastDataType {
CastDataType(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx)
: in_(in), out_(out), ctx_(ctx) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
template <typename OutType>
void apply() {
auto* in_begin = in_.data<InType>();
auto* in_end = in_begin + in_.numel();
auto* out_begin = out_->mutable_data<OutType>(in_.place());
if (platform::is_cpu_place(in_.place())) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
PADDLE_THROW("Unsupported place!");
}
}
};
void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
out->Resize(in.dims());
auto src_type = kernel_type_for_var.data_type_;
auto dst_type = expected_kernel_type.data_type_;
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
case proto::VarType::FP64:
framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
break;
case proto::VarType::INT32:
framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
break;
case proto::VarType::INT64:
framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
break;
case proto::VarType::BOOL:
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
break;
case proto::VarType::INT16:
framework::VisitDataType(dst_type, CastDataType<int16_t>(in, out, ctx));
break;
case proto::VarType::UINT8:
framework::VisitDataType(dst_type, CastDataType<uint8_t>(in, out, ctx));
break;
default:
PADDLE_THROW("Not support type %d", src_type);
}
}
} // namespace framework
} // namespace paddle
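The file above implements the type conversion with two layers of dispatch: the switch on the runtime source type picks a CastDataType&lt;InT&gt; instantiation, and VisitDataType then picks the destination type, so the element-wise cast in CastDataTypeFunctor is fully typed at compile time and can run through platform::Transform on CPU or GPU. A minimal standalone sketch of the same idea, with plain buffers in place of framework::Tensor and a hand-rolled switch in place of VisitDataType (all names here are illustrative, not Paddle's API):

#include <cstdint>
#include <iostream>
#include <vector>

enum class DType { kFP32, kINT64 };

// Fully typed element-wise cast kernel, as in CastDataTypeFunctor above.
template <typename InT, typename OutT>
void CastBuffer(const void* in, void* out, size_t n) {
  const InT* src = static_cast<const InT*>(in);
  OutT* dst = static_cast<OutT*>(out);
  for (size_t i = 0; i < n; ++i) dst[i] = static_cast<OutT>(src[i]);
}

// Runtime switch on the source type picks the template instantiation,
// mirroring switch (src_type) { ... CastDataType<T> ... } above.
void TransData(DType src_type, const void* in, double* out, size_t n) {
  switch (src_type) {
    case DType::kFP32:  CastBuffer<float, double>(in, out, n); break;
    case DType::kINT64: CastBuffer<int64_t, double>(in, out, n); break;
  }
}

int main() {
  std::vector<float> src = {1.5f, 2.5f};
  std::vector<double> dst(src.size());
  TransData(DType::kFP32, src.data(), dst.data(), src.size());
  std::cout << dst[0] << " " << dst[1] << "\n";  // 1.5 2.5
}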
@@ -17,8 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+#if !defined(_WIN32)
 constexpr char Node::kControlDepVarName[];
+#else
+const char Node::kControlDepVarName[] = "__control_var";
+#endif
 int Node::count_ = 0;
 }  // namespace ir
 }  // namespace framework
...
@@ -27,7 +27,11 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+#if !defined(_WIN32)  // MSVC does not support constexpr correctly.
   static constexpr char kControlDepVarName[] = "__control_var";
+#else
+  static const char kControlDepVarName[];
+#endif
   explicit Node(const std::string& name, Type type)
       : name_(name),
...
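For context on the #if in these two hunks: before C++17, a constexpr static data member is initialized inside the class but still needs an out-of-line, initializer-free definition in one translation unit if it is odr-used, and the MSVC versions targeted here mishandled constexpr char arrays, hence the plain const fallback that moves the initializer into the .cc file. A standalone sketch of the pattern (illustrative only):

#include <cstdio>

struct Node {
#if !defined(_WIN32)
  static constexpr char kControlDepVarName[] = "__control_var";
#else
  static const char kControlDepVarName[];
#endif
};

// Out-of-line definitions: the constexpr one must NOT repeat the
// initializer (it already appears in the class); the const one must.
#if !defined(_WIN32)
constexpr char Node::kControlDepVarName[];
#else
const char Node::kControlDepVarName[] = "__control_var";
#endif

int main() { std::puts(Node::kControlDepVarName); }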
@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
...
--- tensor_util.cu, old version (removed): the Windows stand-in for a symlink, holding only the target file name
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
tensor_util.cc
\ No newline at end of file

+++ tensor_util.cu, new version (added): the full source, inlined
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx does not belong to dst_gpu_place or src_gpu_place.");
}
}
}
#endif
}
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
}
TensorCopy(src, dst_place, *dev_ctx, dst);
}
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
}
#endif
}
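Note the difference between the two entry points: TensorCopy enqueues GPU transfers on a CUDA stream and can return before the copy has finished, while TensorCopySync passes a nullptr stream, so the transfer blocks. A hedged usage sketch, assuming the framework headers from this diff are on the include path:

#include "paddle/fluid/framework/tensor_util.h"

namespace fw = paddle::framework;

// Blocking device-to-host copy: safe to read *cpu_out immediately after.
void FetchToCPU(const fw::Tensor& gpu_in, fw::Tensor* cpu_out) {
  paddle::platform::CPUPlace cpu;
  fw::TensorCopySync(gpu_in, cpu, cpu_out);
}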
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void apply() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// Reduce with .any(): true if predicate_(t) holds for any element.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
TensorCopy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast the Eigen vector to a vector of bool: true where the element is NaN.
return eigen_vec.isnan();
}
};
bool TensorContainsNAN(const framework::Tensor& tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast the Eigen vector to a vector of bool: true where the element is Inf.
return eigen_vec.isinf();
}
};
bool TensorContainsInf(const framework::Tensor& tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
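The Any machinery above evaluates a predicate over the whole tensor with Eigen, reduces it with .any() into a one-element bool tensor, and on GPU copies that single bool back to the host. On plain host memory the same checks collapse to std::any_of; a minimal standalone analog (illustrative names, no Eigen):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

bool ContainsNAN(const std::vector<float>& v) {
  return std::any_of(v.begin(), v.end(), [](float x) { return std::isnan(x); });
}

bool ContainsInf(const std::vector<float>& v) {
  return std::any_of(v.begin(), v.end(), [](float x) { return std::isinf(x); });
}

int main() {
  std::vector<float> v = {1.0f, std::numeric_limits<float>::infinity()};
  std::cout << ContainsNAN(v) << " " << ContainsInf(v) << "\n";  // 0 1
}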
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::VarType::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
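TensorToStream writes three fields: a uint32_t format version, an int32_t length-prefixed TensorDesc protobuf, and then the raw element bytes, staged through a 64MB host buffer when the tensor lives on the GPU. A standalone sketch of the same framing, with a plain string standing in for the protobuf desc (hypothetical helper, not Paddle's API):

#include <cstdint>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>

// Field 1: version; field 2: size-prefixed description; field 3: raw data.
void WriteFramedRecord(std::ostream& os, const std::string& desc,
                       const std::vector<float>& data) {
  const uint32_t version = 0;
  os.write(reinterpret_cast<const char*>(&version), sizeof(version));
  const int32_t desc_size = static_cast<int32_t>(desc.size());
  os.write(reinterpret_cast<const char*>(&desc_size), sizeof(desc_size));
  os.write(desc.data(), desc_size);
  os.write(reinterpret_cast<const char*>(data.data()),
           static_cast<std::streamsize>(data.size() * sizeof(float)));
}

int main() {
  std::ostringstream os;
  WriteFramedRecord(os, "fp32 dims:2", {1.0f, 2.0f});
  // 4 (version) + 4 (size) + 11 (desc) + 8 (data) = 27 bytes
  return os.str().size() == 27 ? 0 : 1;
}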
struct DeserializedDataFunctor {
DeserializedDataFunctor(void** buf, Tensor* tensor,
const platform::Place& place)
: buf_(buf), tensor_(tensor), place_(place) {}
template <typename T>
void apply() {
*buf_ = tensor_->mutable_data<T>(place_);
}
void** buf_;
Tensor* tensor_;
platform::Place place_;
};
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::VarType::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size =
tensor->numel() *
framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), size);
}
}
}
} // namespace framework
} // namespace paddle
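TensorFromStream mirrors that layout: read and check the version (only 0 is accepted), read the desc length and parse the desc, then read numel * sizeof(T) bytes into freshly allocated memory, via a CPU staging tensor when the destination is a GPU. The matching reader for the sketch above (same hypothetical framing):

#include <cstdint>
#include <istream>
#include <string>
#include <vector>

bool ReadFramedRecord(std::istream& is, std::string* desc,
                      std::vector<float>* data, size_t numel) {
  uint32_t version = 0;
  is.read(reinterpret_cast<char*>(&version), sizeof(version));
  if (!is || version != 0) return false;  // only version 0 is supported
  int32_t desc_size = 0;
  is.read(reinterpret_cast<char*>(&desc_size), sizeof(desc_size));
  if (!is || desc_size < 0) return false;
  desc->resize(static_cast<size_t>(desc_size));
  is.read(&(*desc)[0], desc_size);
  data->resize(numel);
  is.read(reinterpret_cast<char*>(data->data()),
          static_cast<std::streamsize>(numel * sizeof(float)));
  return static_cast<bool>(is);
}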
--- old file (removed): the Windows inference demo

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

/*
 * This file contains a simple demo for how to take a model for inference.
 */
#include <cassert>
#include <cctype>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <sstream>
#include <string>
#include <thread>  // NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h"

std::string MODELDIR = "";  /* "Directory of the inference model." */  // NOLINT
std::string REFER = "";
/* "path to reference result for comparison." */  // NOLINT
/* path of data; each line is a record, format:
<space splitted floats as data>\t<space splitted ints as shape>
Please check the demo data of data.txt for details.
*/
std::string DATA = "";
bool USE_GPU = true;  /* "Whether use gpu." */

auto message_err = []() {
  std::cout << "Copyright (c) 2018 PaddlePaddle Authors." << std::endl;
  std::cout << "Demo Case for windows inference. "
            << "\n"
            << "Usage: Input your model path and use_gpu as the guide requires,"
            << "then run the demo inference, and will get a result."
            << std::endl;
  std::cout << std::endl;
};

namespace paddle {
namespace demo {

void split(const std::string& str, char sep, std::vector<std::string>* pieces) {
  pieces->clear();
  if (str.empty()) {
    return;
  }
  size_t pos = 0;
  size_t next = str.find(sep, pos);
  while (next != std::string::npos) {
    pieces->push_back(str.substr(pos, next - pos));
    pos = next + 1;
    next = str.find(sep, pos);
  }
  if (!str.substr(pos).empty()) {
    pieces->push_back(str.substr(pos));
  }
}

/*
 * Get a summary of a PaddleTensor content.
 */
std::string SummaryTensor(const PaddleTensor& tensor) {
  std::stringstream ss;
  int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);

  ss << "data[:10]\t";
  switch (tensor.dtype) {
    case PaddleDType::INT64:
      for (int i = 0; i < std::min(num_elems, 10); i++) {
        ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
      }
      break;
    case PaddleDType::FLOAT32:
      for (int i = 0; i < std::min(num_elems, 10); i++) {
        ss << static_cast<float*>(tensor.data.data())[i] << " ";
      }
      break;
  }
  return ss.str();
}

std::string ToString(const NativeConfig& config) {
  std::stringstream ss;
  ss << "Use GPU : " << (config.use_gpu ? "True" : "False") << "\n"
     << "Device : " << config.device << "\n"
     << "fraction_of_gpu_memory : " << config.fraction_of_gpu_memory << "\n"
     << "specify_input_name : "
     << (config.specify_input_name ? "True" : "False") << "\n"
     << "Program File : " << config.prog_file << "\n"
     << "Param File : " << config.param_file;
  return ss.str();
}

struct Record {
  std::vector<float> data;
  std::vector<int32_t> shape;
};

Record ProcessALine(const std::string& line) {
  std::cout << "process a line" << std::endl;
  std::vector<std::string> columns;
  split(line, '\t', &columns);
  assert(columns.size() == 2UL, "data format error, should be <data>\t<shape>");

  Record record;
  std::vector<std::string> data_strs;
  split(columns[0], ' ', &data_strs);
  // Convert the data strings to floats and store them in record.data.
  for (auto& d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(columns[1], ' ', &shape_strs);
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
  std::cout << "data size " << record.data.size() << std::endl;
  std::cout << "data shape size " << record.shape.size() << std::endl;
  return record;
}

void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
  std::string line;
  std::ifstream file(referfile);
  std::getline(file, line);
  auto refer = ProcessALine(line);
  file.close();

  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
  std::cout << "predictor output numel " << numel << std::endl;
  std::cout << "reference output numel " << refer.data.size() << std::endl;
  assert(numel == refer.data.size());
  switch (output.dtype) {
    case PaddleDType::INT64:
      for (size_t i = 0; i < numel; ++i) {
        assert(static_cast<int64_t*>(output.data.data())[i] == refer.data[i]);
      }
      break;
    case PaddleDType::FLOAT32:
      for (size_t i = 0; i < numel; ++i) {
        assert(fabs(static_cast<float*>(output.data.data())[i] -
                    refer.data[i]) <= 1e-5);
      }
      break;
  }
}

/*
 * Use the native fluid engine to inference the demo.
 */
void Main(bool use_gpu) {
  NativeConfig config;
  config.model_dir = MODELDIR;
  // config.param_file = MODELDIR + "/__params__";
  // config.prog_file = MODELDIR + "/__model__";
  config.use_gpu = USE_GPU;
  config.device = 0;
  if (USE_GPU) {
    config.fraction_of_gpu_memory = 0.1f;  // set by yourself
  }
  std::cout << ToString(config) << std::endl;

  std::cout << "init predictor" << std::endl;
  auto predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);

  std::cout << "begin to process data" << std::endl;
  // Just a single batch of data.
  std::string line;
  std::cout << "data : " << std::endl;
  std::ifstream file(DATA);
  if (!file.is_open()) {
    std::cout << "failed open data" << DATA << std::endl;
    exit(0);
  }
  std::getline(file, line);
  auto record = ProcessALine(line);
  file.close();

  // Inference.
  PaddleTensor input;
  input.shape = record.shape;
  input.data =
      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  input.dtype = PaddleDType::FLOAT32;

  std::cout << "run executor" << std::endl;
  std::vector<PaddleTensor> output;
  predictor->Run({input}, &output);

  std::cout << "output.size " << output.size() << std::endl;
  auto& tensor = output.front();
  std::cout << "output: " << SummaryTensor(tensor) << std::endl;

  // compare with reference result
  std::cout << "refer result : " << REFER << std::endl;
  CheckOutput(REFER, tensor);
}

}  // namespace demo
}  // namespace paddle

int main(int argc, char** argv) {
  MODELDIR = "./LB_icnet_model";
  // DATA = "./icnet_image.txt";
  DATA = "./1.png.txt";
  REFER = "./icnet_label.txt";
  paddle::demo::Main(USE_GPU);
  system("pause");
  return 0;
}

+++ new file (added): a naive inference benchmark

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cassert>
#include <chrono>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle {

std::string DIRNAME = "./Release/infer_model";
std::string DATA = "./test-image.txt";
const int C = 3;    // image channel
const int H = 449;  // image height
const int W = 581;  // image width
// Data format:
// "<space splitted floats as data>\t<space splitted ints as shape>"
// 1. Stored in float32 format.
// 2. The mean must be subtracted; the per-channel (CHW) means are
//    112.15, 109.41, 185.42.
struct Record {
  std::vector<float> data;
  std::vector<int32_t> shape;
};

NativeConfig GetConfig() {
  NativeConfig config;
  config.prog_file = DIRNAME + "/__model__";
  config.param_file = DIRNAME + "/__params__";
  config.fraction_of_gpu_memory = 0.0;
  config.use_gpu = true;
  config.device = 0;
  return config;
}

using Time = decltype(std::chrono::high_resolution_clock::now());

Time time() { return std::chrono::high_resolution_clock::now(); }

double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

static void split(const std::string& str, char sep,
                  std::vector<std::string>* pieces) {
  pieces->clear();
  if (str.empty()) {
    return;
  }
  size_t pos = 0;
  size_t next = str.find(sep, pos);
  while (next != std::string::npos) {
    pieces->push_back(str.substr(pos, next - pos));
    pos = next + 1;
    next = str.find(sep, pos);
  }
  if (!str.substr(pos).empty()) {
    pieces->push_back(str.substr(pos));
  }
}

Record ProcessALine(const std::string& line) {
  std::vector<std::string> columns;
  split(line, '\t', &columns);

  Record record;
  std::vector<std::string> data_strs;
  split(columns[0], ' ', &data_strs);
  for (auto& d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(columns[1], ' ', &shape_strs);
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
  return record;
}

void test_naive(int batch_size) {
  NativeConfig config = GetConfig();
  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
  int height = H;
  int width = W;
  int channel = C;
  int num_sum = height * width * channel * batch_size;

  // 1. use fake data
  std::vector<float> data;
  for (int i = 0; i < num_sum; i++) {
    data.push_back(0.0);
  }

  PaddleTensor tensor;
  tensor.shape = std::vector<int>({batch_size, channel, height, width});
  tensor.data.Resize(sizeof(float) * batch_size * channel * height * width);
  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
  tensor.dtype = PaddleDType::FLOAT32;

  // 2. read data from file
  // std::string line;
  // std::ifstream file(DATA);
  // std::getline(file, line);
  // auto record = ProcessALine(line);
  // file.close();
  // PaddleTensor tensor;
  // tensor.shape = record.shape;
  // tensor.data =
  //     PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
  PaddleTensor tensor_out;

  std::vector<PaddleTensor> outputs(1, tensor_out);

  predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
  auto time1 = time();

  for (size_t i = 0; i < 2; i++) {
    std::cout << "Pass " << i << " predict";
    predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
  }

  auto time2 = time();
  std::ofstream ofresult("naive_test_result.txt", std::ios::app);

  std::cout << "batch: " << batch_size
            << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms"
            << std::endl;
  std::cout << outputs.size() << std::endl;
}

}  // namespace paddle

int main(int argc, char** argv) {
  paddle::test_naive(1 << 0);
  return 0;
}
\ No newline at end of file
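The new benchmark warms up with one Run, then times two more passes with high_resolution_clock; note that it divides the elapsed time by 100.0 even though the loop runs only twice, which looks like a leftover from a 100-iteration version. A standalone sketch of the timing harness with a dummy workload standing in for predictor->Run (illustrative names):

#include <chrono>
#include <iostream>
#include <vector>

using Time = decltype(std::chrono::high_resolution_clock::now());
Time now() { return std::chrono::high_resolution_clock::now(); }

double elapsed_ms(Time t1, Time t2) {
  return std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
             .count() / 1000.0;
}

int main() {
  const int kPasses = 2;
  std::vector<double> v(1 << 20, 1.0);
  double sum = 0.0;
  for (double x : v) sum += x;  // warm-up pass, not timed
  auto t1 = now();
  for (int i = 0; i < kPasses; ++i) {
    for (double x : v) sum += x;  // timed workload
  }
  auto t2 = now();
  // Divide by the actual pass count, not a hard-coded constant.
  std::cout << "cost: " << elapsed_ms(t1, t2) / kPasses
            << " ms/pass, sum=" << sum << "\n";
}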
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <iostream>
#include <fstream>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
std::string DIRNAME = "./LB_icnet_model";
//std::string DIRNAME = "./infer_models";
NativeConfig GetConfig() {
NativeConfig config;
config.prog_file=DIRNAME + "/__model__";
config.param_file=DIRNAME + "/__params__";
config.fraction_of_gpu_memory = 0.8;
config.use_gpu = true;
config.device = 0;
return config;
}
using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
double time_diff(Time t1, Time t2) {
typedef std::chrono::microseconds ms;
auto diff = t2 - t1;
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
void test_naive(int batch_size){
NativeConfig config = GetConfig();
// config.model_dir = model_path;
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
int height = 449;
int width = 581;
//int height = 3;
//int width = 3;
int num_sum = height * width * 3 * batch_size;
std::vector<float> data;
for(int i = 0; i < num_sum; i++) {
data.push_back(0.0);
}
PaddleTensor tensor;
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
PaddleTensor tensor_out;
std::vector<PaddleTensor> outputs(1, tensor_out);
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "start predict123:" << std::endl;
auto time1 = time();
for(size_t i = 0; i < 2; i++) {
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "pass " << i;
}
auto time2 = time();
std::ofstream ofresult("naive_test_result.txt", std::ios::app);
std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
std::cout << outputs.size() << std::endl;
/*
int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
ofresult << std::to_string(data_o[j]) << " ";
}
ofresult << std::endl;
ofresult.close();
*/
}
} // namespace paddle
int main(int argc, char** argv) {
paddle::test_naive(1 << 0);
return 0;
}
\ No newline at end of file
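The commented-out block near the end dumps the first output tensor as int64 text. A cleaned-up version as a helper, built only from accessors that appear in the snippet itself (PaddleTensor's data.data() and data.length()); treat it as a sketch:

#include <cstdint>
#include <fstream>
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Append an int64 output tensor to a text file, one run per line.
void DumpInt64Output(paddle::PaddleTensor& out, const std::string& path) {
  std::ofstream ofresult(path, std::ios::app);
  int64_t* data_o = static_cast<int64_t*>(out.data.data());
  for (size_t j = 0; j < out.data.length() / sizeof(int64_t); ++j) {
    ofresult << data_o[j] << " ";
  }
  ofresult << "\n";
}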
@@ -43,6 +43,7 @@ template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    VLOG(3) << "inside cudnn";
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
+    VLOG(3) << "get all inputs";
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
@@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
-
+    VLOG(3) << "create tensor descriptor";
 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it manually
     // FIXME(typhoonzero): find a better way to disable groups
@@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
-
+    VLOG(3) << "before create tensor descriptor";
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
@@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       output_height = output->dims()[2];
       output_width = output->dims()[3];
     }
-
+    VLOG(3) << "after create tensor descriptor";
     int group_offset_in =
         input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
@@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
+    VLOG(3) << "set cudnn algorithm";
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
-
+    VLOG(3) << "before get workspace";
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // the limit because the algo is overridden to use tensor core.
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
+    VLOG(3) << "after get workspace";
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    VLOG(3) << "allocate memory";
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
           &beta, cudnn_output_desc, output_data + i * group_offset_out));
     }
+    VLOG(3) << "cudnn forward";
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
+    VLOG(3) << "cudnn pass";
   }
 };
@@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // Already on GPU
     void* cudnn_workspace = nullptr;
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
...
@@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
+    std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
+    PADDLE_ENFORCE(!fin.bad(),
                    "Cannot open file %s for load_combine op", filename);
     auto out_var_names = Outputs("Out");
@@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase {
     auto &dev_ctx = *pool.Get(place);
     for (size_t i = 0; i < out_var_names.size(); i++) {
+      VLOG(3) << "load " << out_var_names[i];
       auto *out_var = scope.FindVar(out_var_names[i]);
       PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                      out_var_names[i]);
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+      VLOG(3) << "Get Tensor";
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+      PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s",
                      filename);
+      VLOG(3) << "before deserialization";
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
+      VLOG(3) << "after deserialization";
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
           load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->set_lod(fp16_tensor.lod());
         tensor->ShareDataWith(fp16_tensor);
       }
+      VLOG(3) << "load " << out_var_names[i] << " finished";
     }
   }
 };
...
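Two things stand out in this hunk. Opening the stream with std::ios_base::binary matters on Windows, where text mode translates \r\n sequences and treats the 0x1A byte as end-of-file, corrupting serialized tensors. But !fin.bad() is weaker than static_cast<bool>(fin): a failed open or a short read sets failbit, not badbit, so the new check no longer catches a missing file (presumably a spurious failbit on Windows motivated the relaxation). A standalone sketch of both points:

#include <fstream>
#include <iostream>

int main() {
  // Bytes that Windows text mode would mangle: CR LF and Ctrl-Z (0x1A).
  const unsigned char raw[] = {0x0D, 0x0A, 0x1A, 0x42};
  {
    std::ofstream out("blob.bin", std::ios_base::out | std::ios_base::binary);
    out.write(reinterpret_cast<const char*>(raw), sizeof(raw));
  }

  // Binary mode reads all four bytes back untouched on every platform.
  std::ifstream fin("blob.bin", std::ios_base::in | std::ios_base::binary);
  char buf[sizeof(raw)];
  fin.read(buf, sizeof(buf));
  std::cout << "read " << fin.gcount() << " bytes\n";  // 4

  // A missing file sets failbit, not badbit, so !bad() would still pass.
  std::ifstream missing("no_such_file.bin", std::ios_base::binary);
  std::cout << "ok: " << static_cast<bool>(missing)  // 0
            << " bad(): " << missing.bad() << "\n";  // 0
}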
@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
+#if !defined(_WIN32)
 #define CUDNN_ENFORCE(condition)                                     \
   do {                                                               \
     cudnnStatus_t status = condition;                                \
@@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
     }                                                                \
   } while (false)
+#else
+#define CUDNN_ENFORCE(condition)
+#endif
 enum class DataLayout {  // Not use
   kNHWC,
...
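On Windows this hunk compiles CUDNN_ENFORCE away entirely, so cuDNN status codes are silently dropped (including any failure caused by the 1024-byte workspace override in the conv kernel above); presumably a temporary unblocking measure rather than the intended final state. The do { ... } while (false) wrapper itself is the standard trick that makes a multi-statement macro behave as a single statement after if/else; a standalone sketch with a plain int status in place of cudnnStatus_t:

#include <cstdio>
#include <cstdlib>

// Multi-statement check macro that still parses as one statement.
#define CHECK_STATUS(condition)                      \
  do {                                               \
    int status_ = (condition);                       \
    if (status_ != 0) {                              \
      std::fprintf(stderr, "status %d\n", status_);  \
      std::abort();                                  \
    }                                                \
  } while (false)

int succeed() { return 0; }

int main() {
  if (succeed() == 0)
    CHECK_STATUS(succeed());  // no dangling-else problem thanks to do/while
  else
    std::puts("unreachable");
  std::puts("ok");
}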