Commit 174c241c authored by Guangda Lai, committed by TensorFlower Gardener

Transfer the ownership of the calibrated TRT engine to the engine cache, so we can dump it out for TF 2.0.

PiperOrigin-RevId: 258468798
Parent 10387fcd
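The core pattern, sketched with hypothetical stand-in types (a plain Engine struct and a std::map keyed by shapes instead of the real nvinfer1::ICudaEngine and TRTEngineCacheResource in the diff below): once calibration finishes, the engine moves out of the calibration resource and into the engine cache, which then owns it and can serialize it out.

#include <map>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-ins for nvinfer1::ICudaEngine and TF-TRT's EngineContext.
struct Engine {};
struct EngineContext {
  explicit EngineContext(std::unique_ptr<Engine> e) : engine(std::move(e)) {}
  std::unique_ptr<Engine> engine;
};

int main() {
  // The calibration resource starts out owning the freshly calibrated engine.
  std::unique_ptr<Engine> calibrated_engine(new Engine);

  // Move it into a cache keyed by input shapes, mirroring the
  // cache_res->cache_.emplace(shapes, ...) call in the diff below.
  std::map<std::vector<int>, std::unique_ptr<EngineContext>> cache;
  cache.emplace(std::vector<int>{8, 224, 224, 3},
                std::make_unique<EngineContext>(std::move(calibrated_engine)));

  // The cache is now the sole owner and can dump the engine for TF 2.0.
  return calibrated_engine == nullptr ? 0 : 1;
}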
@@ -55,6 +55,11 @@ class GetCalibrationDataOp : public OpKernel {
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, TensorShape({}), &output));
// Destroy the resource.
OP_REQUIRES_OK(context,
context->resource_manager()->Delete<TRTCalibrationResource>(
std::string(kCalibrationContainerName), resource_name));
output->scalar<string>()() = serialized_resource;
}
};
@@ -101,8 +101,12 @@ class TRTEngineOp : public AsyncOpKernel {
// Allocate necessary resources for calibration
Status AllocateCalibrationResources(OpKernelContext* ctx,
TRTEngineCacheResource* cache_res,
TRTCalibrationResource** cr);
Status GetEngineCacheResource(OpKernelContext* ctx,
TRTEngineCacheResource** cache_res);
// Get engine for the input shape
StatusOr<EngineContext*> GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx);
@@ -274,14 +278,19 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
VLOG(1) << "Executing TRT calibration: " << name();
helper->Ref();
core::ScopedUnref sc(helper);
// Get the cache resource outside the LookupOrCreate() below to avoid
// deadlock.
TRTEngineCacheResource* cache_res = nullptr;
OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper);
core::ScopedUnref unref_cache_res(cache_res);
TRTCalibrationResource* calib_res = nullptr;
OP_REQUIRES_OK_ASYNC(
ctx,
ctx->resource_manager()->LookupOrCreate(
std::string(kCalibrationContainerName), name(),
reinterpret_cast<TRTCalibrationResource**>(&calib_res),
{[ctx, this](TRTCalibrationResource** cr) -> Status {
return this->AllocateCalibrationResources(ctx, cr);
{[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status {
return this->AllocateCalibrationResources(ctx, cache_res, cr);
}}),
*helper);
core::ScopedUnref calib_sc(calib_res);
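The deadlock comment above deserves a note: a resource manager that holds its lock while running a creator function will self-deadlock if that creator calls back into the manager. A minimal sketch under that assumption (SimpleResourceMgr is hypothetical, not TF's actual ResourceMgr API):

#include <functional>
#include <map>
#include <mutex>
#include <string>

// Hypothetical simplified resource manager: the lock is held while the
// creator runs, so a creator that calls back into LookupOrCreate() blocks
// forever on the same non-reentrant mutex.
class SimpleResourceMgr {
 public:
  int* LookupOrCreate(const std::string& name,
                      const std::function<int*()>& creator) {
    std::lock_guard<std::mutex> lock(mu_);  // would self-deadlock if re-entered
    auto it = resources_.find(name);
    if (it == resources_.end()) it = resources_.emplace(name, creator()).first;
    return it->second;
  }

 private:
  std::mutex mu_;
  std::map<std::string, int*> resources_;
};

int main() {
  SimpleResourceMgr mgr;
  // Safe: this creator does not touch the manager.
  int* cache = mgr.LookupOrCreate("cache", [] { return new int(42); });
  // Deadlock (left commented out): the creator re-enters LookupOrCreate().
  // int* bad = mgr.LookupOrCreate("outer", [&mgr] {
  //   return mgr.LookupOrCreate("inner", [] { return new int(0); });
  // });
  return *cache == 42 ? 0 : 1;
}

Fetching cache_res before LookupOrCreate() and capturing it in the creator lambda keeps the calibration creator from re-entering the resource manager.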
@@ -555,13 +564,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
return !kRetry;
}
StatusOr<EngineContext*> TRTEngineOp::GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
static EngineContext empty_context;
mutex_lock lock(engine_mutex_);
// TODO(tmorris): using first input to get batch size - is this reliable?
const int batch_size = input_shapes[0].dim_size(0);
Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx,
TRTEngineCacheResource** cache_res) {
// Canonicalize the op name by removing the scopes if any. This is mainly
// because in TFv2, the function graph can be instantiated in various ways and
// it'll insert scope names to the name of the TRTEngineOps, which will result
@@ -575,21 +579,24 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
}
// Get engine cache.
TRTEngineCacheResource* cache_res = nullptr;
auto status = ctx->resource_manager()->LookupOrCreate(
"TF-TRT-Engine-Cache", string(resource_name), &cache_res,
return ctx->resource_manager()->LookupOrCreate(
"TF-TRT-Engine-Cache", string(resource_name), cache_res,
{[this, ctx](TRTEngineCacheResource** cr) -> Status {
*cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_);
return Status::OK();
}});
if (!status.ok()) {
LOG(WARNING) << "Not able to find or create engine cache for " << name()
<< ". The native segment will be used instead. "
<< "Reason: " << status;
return &empty_context;
}
}
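The canonicalization comment at the top of GetEngineCacheResource suggests stripping scope prefixes from the op name before using it as a cache key. A hedged sketch of that idea (CanonicalizeOpName is a hypothetical helper; the actual implementation sits in the collapsed lines above):

#include <string>

// Hypothetical helper: drop scope prefixes so e.g. "while/body/TRTEngineOp_0"
// and "TRTEngineOp_0" resolve to the same engine-cache resource name.
std::string CanonicalizeOpName(const std::string& name) {
  const auto pos = name.find_last_of('/');
  return pos == std::string::npos ? name : name.substr(pos + 1);
}

int main() {
  return CanonicalizeOpName("while/body/TRTEngineOp_0") == "TRTEngineOp_0" ? 0
                                                                           : 1;
}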
StatusOr<EngineContext*> TRTEngineOp::GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
static EngineContext empty_context;
TRTEngineCacheResource* cache_res = nullptr;
TF_RETURN_IF_ERROR(GetEngineCacheResource(ctx, &cache_res));
core::ScopedUnref sc(cache_res);
mutex_lock lock(engine_mutex_);
// TODO(tmorris): using first input to get batch size - is this reliable?
const int batch_size = input_shapes[0].dim_size(0);
auto& cache = cache_res->cache_;
auto allocator = cache_res->allocator_.get();
if (allocator == nullptr) {
@@ -687,23 +694,15 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
return cache.at(engine_input_shapes).get();
}
Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
TRTCalibrationResource** cr) {
Status TRTEngineOp::AllocateCalibrationResources(
OpKernelContext* ctx, TRTEngineCacheResource* cache_res,
TRTCalibrationResource** cr) {
auto cres = new TRTCalibrationResource();
*cr = cres;
// Get the allocator.
auto alloc = ctx->device()->GetAllocator(AllocatorAttributes());
if (!alloc) {
LOG(WARNING) << "Can't get device allocator will not be able to "
"allocate memory from TensorFlow memory pool";
cres->allocator_.reset(new TRTCudaAllocator);
} else {
cres->allocator_.reset(new TRTDeviceAllocator(alloc));
}
// Get the input shapes.
const int batch_size = ctx->input(0).dim_size(0);
const int num_inputs = ctx->num_inputs();
std::vector<PartialTensorShape> shapes;
std::vector<TensorShape> shapes;
cres->device_tensors_.resize(num_inputs);
VLOG(1) << " Constructing calibrator";
for (int i = 0; i < num_inputs; i++) {
@@ -725,8 +724,6 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
}
cres->calibrator_.reset(
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
const string label(name());
auto segment_graph = &segment_graph_;
const int platform_gpu_id =
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
if (platform_gpu_id < 0) {
@@ -734,37 +731,57 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
return errors::InvalidArgument(
"Context->device doesn't contain device info!");
}
const int64 workspace_size_bytes = workspace_size_;
cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes,
platform_gpu_id, workspace_size_bytes]() {
LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
<< ", Calibration Resource @ " << cres;
auto err = cudaSetDevice(platform_gpu_id);
if (err != cudaSuccess) {
// TODO(aaroey): should return error here.
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
<< " in calibration thread";
}
// ConvertGraphDefToEngine() will try to build the engine. This thread
// will loop inside buildCudaEngine() consuming the calibration data
// that is set by the TF op, and drive the builder until calibrator returns
// false. Engine is discarded after calibration table is generated
//
// TODO(aaroey): maybe setting the max batch size using the python
// calibration wrapper class.
auto s = convert::ConvertGraphDefToEngine(
*segment_graph, TrtPrecisionMode::INT8,
cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes,
&cres->logger_, cres->allocator_.get(), cres->calibrator_.get(),
&cres->engine_,
/*use_calibration=*/true,
/*convert_successfully=*/nullptr);
if (!s.ok()) {
LOG(ERROR) << "Calibration failed: " << s;
cres->calibrator_->setDone(); // Ignore further pushes
}
VLOG(1) << "Calibration loop terminated " << label;
}));
cache_res->Ref();
cres->thr_.reset(
new std::thread([this, cres, shapes, platform_gpu_id, cache_res]() {
core::ScopedUnref sc(cache_res);
LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
<< ", Calibration Resource @ " << cres;
auto err = cudaSetDevice(platform_gpu_id);
if (err != cudaSuccess) {
// TODO(aaroey): should return error here.
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
<< " in calibration thread";
}
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
shapes.end());
// ConvertGraphDefToEngine() will try to build the engine. This thread
// will loop inside buildCudaEngine() consuming the calibration data
// that is set by the TF op, and drive the builder until calibrator
// returns false. Engine is discarded after calibration table is
// generated
//
// TODO(aaroey): maybe setting the max batch size using the python
// calibration wrapper class.
auto s = convert::ConvertGraphDefToEngine(
this->segment_graph_, TrtPrecisionMode::INT8,
cres->calibrator_->getBatchSize(), this->workspace_size_,
partial_shapes, &cres->logger_, cache_res->allocator_.get(),
cres->calibrator_.get(), &cres->engine_,
/*use_calibration=*/true,
/*convert_successfully=*/nullptr);
if (!s.ok()) {
LOG(ERROR) << "Calibration failed: " << s;
cres->calibrator_->setDone(); // Ignore further pushes
}
// Transfer the ownership of the engine to the engine cache, so we can
// dump it out during conversion for TF 2.0.
if (cache_res) {
mutex_lock lock(this->engine_mutex_);
cres->SetCalibrationTable();
this->calibrator_ = std::move(cres->calibrator_);
TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
cres->engine_->createExecutionContext());
cache_res->cache_.emplace(
shapes, absl::make_unique<EngineContext>(
std::move(cres->engine_), std::move(exec_context)));
}
VLOG(1) << "Calibration loop terminated " << this->name();
}));
VLOG(1) << "initialized calibrator resource";
return Status::OK();
}
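The comment about looping inside buildCudaEngine() describes a producer/consumer handshake: the op thread stages batches while the builder thread consumes them through the calibrator until setDone() ends the loop. A minimal sketch under simplified assumptions (MiniCalibrator is hypothetical; the real TRTInt8Calibrator exchanges device buffers, not ints):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

class MiniCalibrator {
 public:
  // Op thread: stage one batch; blocks until the previous one is consumed.
  bool setBatch(int batch) {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return !batch_ready_ || done_; });
    if (done_) return false;
    batch_ = batch;
    batch_ready_ = true;
    cv_.notify_all();
    return true;
  }
  // Builder thread: consume one batch; returns false once calibration is done,
  // which is what terminates the build loop inside the builder.
  bool getBatch(int* batch) {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return batch_ready_ || done_; });
    if (done_) return false;
    *batch = batch_;
    batch_ready_ = false;
    cv_.notify_all();
    return true;
  }
  // Mirrors TRTInt8Calibrator::waitAndSetDone(): drain, then signal done.
  void waitAndSetDone() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return !batch_ready_; });
    done_ = true;
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int batch_ = 0;
  bool batch_ready_ = false;
  bool done_ = false;
};

int main() {
  MiniCalibrator c;
  std::thread builder([&c] {
    int b;
    while (c.getBatch(&b)) std::printf("calibrating on batch %d\n", b);
  });
  for (int i = 0; i < 3; ++i) c.setBatch(i);  // op thread pushes batches
  c.waitAndSetDone();                         // ends the calibration loop
  builder.join();
  return 0;
}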
@@ -25,11 +25,6 @@ const absl::string_view kCalibrationContainerName = "TF-TRT-Calibration";
TRTCalibrationResource::~TRTCalibrationResource() {
VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
builder_.reset();
engine_.reset();
// We need to manually destroy the builder and engine before the allocator
// is destroyed.
allocator_.reset();
}
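The removed resets existed because C++ destroys members in reverse declaration order: allocator_ was declared after builder_ and engine_, so by default it would have been destroyed first, while the builder and engine could still reference it. With the allocator now owned by TRTEngineCacheResource, the manual ordering is unnecessary. A small illustration (Tracer is a hypothetical stand-in):

#include <cstdio>
#include <memory>

// Hypothetical tracer to make destruction order visible.
struct Tracer {
  explicit Tracer(const char* n) : name(n) {}
  ~Tracer() { std::printf("destroying %s\n", name); }
  const char* name;
};

// Mirrors the old declaration order: allocator_ declared last, hence
// destroyed first unless the destructor resets builder_/engine_ manually.
struct OldResource {
  std::unique_ptr<Tracer> builder_{new Tracer("builder")};
  std::unique_ptr<Tracer> engine_{new Tracer("engine")};
  std::unique_ptr<Tracer> allocator_{new Tracer("allocator")};
};

int main() {
  OldResource r;  // prints: allocator, engine, builder
  return 0;
}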
string TRTCalibrationResource::DebugString() const {
@@ -41,15 +36,18 @@ string TRTCalibrationResource::DebugString() const {
<< " Builder = " << hex << builder_.get() << dec << endl
<< " Engine = " << hex << engine_.get() << dec << endl
<< " Logger = " << hex << &logger_ << dec << endl
<< " Allocator = " << hex << allocator_.get() << dec << endl
<< " Thread = " << hex << thr_.get() << dec << endl;
return oss.str();
}
void TRTCalibrationResource::SetCalibrationTable() {
calibration_table_ = calibrator_->getCalibrationTableAsString();
}
Status TRTCalibrationResource::SerializeToString(string* serialized) {
calibrator_->waitAndSetDone();
thr_->join();
*serialized = calibrator_->getCalibrationTableAsString();
*serialized = calibration_table_;
if (serialized->empty()) {
return errors::Unknown("Calibration table is empty.");
}
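SerializeToString() now returns the snapshot taken by SetCalibrationTable() because the calibration thread moves calibrator_ out of the resource once calibration finishes; after that move the resource can no longer query the calibrator directly. A hedged sketch of that ordering (Calibrator is a hypothetical stand-in):

#include <memory>
#include <string>
#include <utility>

// Hypothetical stand-in for TRTInt8Calibrator.
struct Calibrator {
  std::string getCalibrationTableAsString() const { return "table-bytes"; }
};

struct CalibResource {
  std::unique_ptr<Calibrator> calibrator_{new Calibrator};
  std::string calibration_table_;
  // Mirrors SetCalibrationTable(): snapshot while the calibrator is present.
  void SetCalibrationTable() {
    calibration_table_ = calibrator_->getCalibrationTableAsString();
  }
};

int main() {
  CalibResource res;
  res.SetCalibrationTable();                // snapshot first...
  auto moved = std::move(res.calibrator_);  // ...then ownership leaves
  return res.calibration_table_.empty() ? 1 : 0;  // table survives the move
}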
@@ -45,6 +45,8 @@ class TRTCalibrationResource : public ResourceBase {
string DebugString() const override;
void SetCalibrationTable();
Status SerializeToString(string* serialized);
// Lookup table for temporary staging areas of input tensors for calibration.
@@ -53,10 +55,10 @@ class TRTCalibrationResource : public ResourceBase {
// Temporary staging areas for calibration inputs.
std::vector<PersistentTensor> device_tensors_;
string calibration_table_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
TrtUniquePtrType<nvinfer1::IBuilder> builder_;
TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
std::unique_ptr<TRTBaseAllocator> allocator_;
Logger logger_;
// TODO(sami): Use threadpool threads!
std::unique_ptr<std::thread> thr_;
@@ -58,16 +58,6 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
namespace tensorflow {
namespace tensorrt {
void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
void* memory;
cudaMalloc(&memory, size);
return memory;
}
void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
if (size == 0) return nullptr;
@@ -46,16 +46,6 @@ class TRTBaseAllocator : public nvinfer1::IGpuAllocator {
virtual ~TRTBaseAllocator() = default;
};
class TRTCudaAllocator : public TRTBaseAllocator {
// Allocator implementation that is using cuda allocator instead of device
// allocator in case we can't get device allocator from TF.
public:
TRTCudaAllocator() {}
virtual ~TRTCudaAllocator() {}
void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
void free(void* memory) override;
};
class TRTDeviceAllocator : public TRTBaseAllocator {
// Allocator implementation wrapping TF device allocators.
public:
@@ -118,6 +118,7 @@ void TRTInt8Calibrator::waitAndSetDone() {
if (!done_) {
done_ = true;
cond_.notify_all();
dev_buffers_.clear();
}
}
@@ -89,7 +89,7 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
bool done_;
// Map to keep tensorrt input buffers and sizes keyed with buffer names
const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_;
std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_;
bool calib_running_;
bool batch_is_set_;
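Dropping const from dev_buffers_ pairs with the waitAndSetDone() change above: a const member cannot be clear()ed, and clearing is what releases the staging buffers once calibration is done. A tiny sketch of the point (CalibratorSketch is hypothetical):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>

struct CalibratorSketch {
  // With 'const' here, this member could not be cleared; dropping const lets
  // waitAndSetDone() release the staging buffers once calibration ends.
  std::unordered_map<std::string, std::pair<void*, std::size_t>> dev_buffers_;
  void waitAndSetDone() { dev_buffers_.clear(); }
};

int main() {
  CalibratorSketch c;
  c.waitAndSetDone();
  return c.dev_buffers_.empty() ? 0 : 1;
}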