提交 3c40cb76 编写于 作者: N nhzlx

7 refine zero copy

update TensorRT in the Dockerfile
test=develop
上级 2eff3e26
......@@ -75,7 +75,8 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
RUN wget -qO- https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -435,12 +435,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
}
fetches_[idx] = op;
idx2fetches_[idx] = op->Input("X")[0];
}
}
}
......@@ -453,6 +455,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
var->GetMutable<framework::FeedFetchList>();
}
// Returns the feed variable names stored in idx2feeds_. Because idx2feeds_ is
// an ordered map keyed by feed index, names come out in ascending index order.
std::vector<std::string> AnalysisPredictor::GetInputNames() {
  std::vector<std::string> names;
  for (const auto &entry : idx2feeds_) {
    names.emplace_back(entry.second);
  }
  return names;
}
// Returns the fetch variable names stored in idx2fetches_. Because
// idx2fetches_ is an ordered map keyed by fetch index, names come out in
// ascending index order.
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  std::vector<std::string> names;
  for (const auto &entry : idx2fetches_) {
    names.emplace_back(entry.second);
  }
  return names;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
......@@ -460,6 +478,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......@@ -470,6 +495,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......
......@@ -55,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
......@@ -133,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
// Sorted according to the idx.
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res;
}
// Copies tensor->numel() elements of type T from the host buffer `data` into
// the underlying tensor. The destination storage is obtained with
// mutable_data<T>() on the place recorded in place_ / device_ (see SetPlace):
// CPU when place_ is PaddlePlace::kCPU, otherwise the CUDA device `device_`.
// The byte count is derived from numel(), so the tensor's shape has to be set
// before calling this function.
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
  EAGER_GET_TENSOR;
  // The two literals are concatenated by the compiler; the trailing space
  // keeps "shape)" and "function" from running together in the message.
  PADDLE_ENFORCE_GE(
      tensor->numel(), 0,
      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape) "
      "function before copy data from cpu.");
  size_t ele_size = tensor->numel() * sizeof(T);

  if (place_ == PaddlePlace::kCPU) {
    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    platform::CUDAPlace gpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));

    // Host -> device copy, issued with the device context's stream.
    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                 data, ele_size, dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}
// Copies tensor->numel() elements of type T from the underlying tensor into
// the host buffer `data`. The source place is taken from the tensor itself
// (tensor->place()), not from place_. `data` must have room for numel()
// elements; this function does not check it.
template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
  EAGER_GET_TENSOR;
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  if (platform::is_cpu_place(t_place)) {
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));

    // Device -> host copy, issued with the device context's stream.
    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                 t_data, ele_num * sizeof(T), dev_ctx->stream());
    // The copy above is only submitted to the stream; block until the stream
    // has drained so `data` is fully populated when this function returns.
    dev_ctx->Wait();
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}
// Explicit instantiations of copy_from_cpu / copy_to_cpu for the element
// types float and int64_t.
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
......@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
return framework::vectorize2int(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
......
......@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
// No-op implementation: always returns nullptr.
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }
// No-op implementation: the given LoD `x` is ignored.
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
......
......@@ -160,11 +160,21 @@ class ZeroCopyTensor {
template <typename T>
T* data(PaddlePlace* place, int* size) const;
std::vector<int64_t> shape() const;
// Copy numel() elements of type T from the host buffer `data` into this
// tensor, on the place recorded by SetPlace().
template <typename T>
void copy_from_cpu(const T* data);
// Copy numel() elements of type T from this tensor into the host buffer
// `data`; the caller provides a buffer large enough to hold them.
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; }
// Stores the target place and device id in place_ / device_. `device`
// defaults to -1 when the caller does not pass one.
void SetPlace(PaddlePlace place, int device = -1) {
  device_ = device;
  place_ = place;
}
protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
......@@ -179,6 +189,8 @@ class ZeroCopyTensor {
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
mutable void* tensor_{nullptr};
// Place assigned through SetPlace(); copy_from_cpu() reads it to choose
// between the CPU and GPU destination.
PaddlePlace place_;
// Device id assigned through SetPlace() (-1 when the caller gives none);
// used to build the CUDAPlace in copy_from_cpu().
int device_;
};
/** A simple Inference API for Paddle.
......@@ -200,6 +212,14 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/** \brief Get input names of the model.
 *
 * The default implementation returns an empty list.
 */
virtual std::vector<std::string> GetInputNames() {
  return std::vector<std::string>();
}
/** \brief Get output names of the model.
 *
 * The default implementation returns an empty list.
 */
virtual std::vector<std::string> GetOutputNames() {
  return std::vector<std::string>();
}
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册