Commit 174c241c authored by Guangda Lai, committed by TensorFlower Gardener

Transfer the ownership of the calibrated TRT engine to the engine cache, so we can dump it out for TF 2.0.

PiperOrigin-RevId: 258468798
Parent 10387fcd
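The core pattern, sketched with hypothetical stand-in types (a plain Engine struct and a std::map keyed by shapes instead of the real nvinfer1::ICudaEngine and TRTEngineCacheResource in the diff below): once calibration finishes, the engine moves out of the calibration resource and into the engine cache, which then owns it and can serialize it out.

#include <map>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-ins for nvinfer1::ICudaEngine and TF-TRT's EngineContext.
struct Engine {};
struct EngineContext {
  explicit EngineContext(std::unique_ptr<Engine> e) : engine(std::move(e)) {}
  std::unique_ptr<Engine> engine;
};

int main() {
  // The calibration resource starts out owning the freshly calibrated engine.
  std::unique_ptr<Engine> calibrated_engine(new Engine);

  // Move it into a cache keyed by input shapes, mirroring the
  // cache_res->cache_.emplace(shapes, ...) call in the diff below.
  std::map<std::vector<int>, std::unique_ptr<EngineContext>> cache;
  cache.emplace(std::vector<int>{8, 224, 224, 3},
                std::make_unique<EngineContext>(std::move(calibrated_engine)));

  // The cache is now the sole owner and can dump the engine for TF 2.0.
  return calibrated_engine == nullptr ? 0 : 1;
}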
@@ -55,6 +55,11 @@ class GetCalibrationDataOp : public OpKernel {
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(0, TensorShape({}), &output));
// Destroy the resource.
OP_REQUIRES_OK(context,
context->resource_manager()->Delete<TRTCalibrationResource>(
std::string(kCalibrationContainerName), resource_name));
output->scalar<string>()() = serialized_resource;
}
};
@@ -101,8 +101,12 @@ class TRTEngineOp : public AsyncOpKernel {
// Allocate necessary resources for calibration
Status AllocateCalibrationResources(OpKernelContext* ctx,
TRTEngineCacheResource* cache_res,
TRTCalibrationResource** cr);
Status GetEngineCacheResource(OpKernelContext* ctx,
TRTEngineCacheResource** cache_res);
// Get engine for the input shape
StatusOr<EngineContext*> GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx);
@@ -274,14 +278,19 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
VLOG(1) << "Executing TRT calibration: " << name();
helper->Ref();
core::ScopedUnref sc(helper);
// Get the cache resource outside the LookupOrCreate() below to avoid
// deadlock.
TRTEngineCacheResource* cache_res = nullptr;
OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper);
core::ScopedUnref unref_cache_res(cache_res);
TRTCalibrationResource* calib_res = nullptr;
OP_REQUIRES_OK_ASYNC(
ctx,
ctx->resource_manager()->LookupOrCreate(
std::string(kCalibrationContainerName), name(),
reinterpret_cast<TRTCalibrationResource**>(&calib_res),
{[ctx, this](TRTCalibrationResource** cr) -> Status {
return this->AllocateCalibrationResources(ctx, cr);
{[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status {
return this->AllocateCalibrationResources(ctx, cache_res, cr);
}}),
*helper);
core::ScopedUnref calib_sc(calib_res);
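The deadlock comment above deserves a note: a resource manager that holds its lock while running a creator function will self-deadlock if that creator calls back into the manager. A minimal sketch under that assumption (SimpleResourceMgr is hypothetical, not TF's actual ResourceMgr API):

#include <functional>
#include <map>
#include <mutex>
#include <string>

// Hypothetical simplified resource manager: the lock is held while the
// creator runs, so a creator that calls back into LookupOrCreate() blocks
// forever on the same non-reentrant mutex.
class SimpleResourceMgr {
 public:
  int* LookupOrCreate(const std::string& name,
                      const std::function<int*()>& creator) {
    std::lock_guard<std::mutex> lock(mu_);  // would self-deadlock if re-entered
    auto it = resources_.find(name);
    if (it == resources_.end()) it = resources_.emplace(name, creator()).first;
    return it->second;
  }

 private:
  std::mutex mu_;
  std::map<std::string, int*> resources_;
};

int main() {
  SimpleResourceMgr mgr;
  // Safe: this creator does not touch the manager.
  int* cache = mgr.LookupOrCreate("cache", [] { return new int(42); });
  // Deadlock (left commented out): the creator re-enters LookupOrCreate().
  // int* bad = mgr.LookupOrCreate("outer", [&mgr] {
  //   return mgr.LookupOrCreate("inner", [] { return new int(0); });
  // });
  return *cache == 42 ? 0 : 1;
}

Fetching cache_res before LookupOrCreate() and capturing it in the creator lambda keeps the calibration creator from re-entering the resource manager.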
@@ -555,13 +564,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
return !kRetry;
}
StatusOr<EngineContext*> TRTEngineOp::GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
static EngineContext empty_context;
mutex_lock lock(engine_mutex_);
// TODO(tmorris): using first input to get batch size - is this reliable?
const int batch_size = input_shapes[0].dim_size(0);
Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx,
TRTEngineCacheResource** cache_res) {
// Canonicalize the op name by removing the scopes if any. This is mainly
// because in TFv2, the function graph can be instantiated in various ways and
// it'll insert scope names to the name of the TRTEngineOps, which will result
@@ -575,21 +579,24 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
}
// Get engine cache.
TRTEngineCacheResource* cache_res = nullptr;
auto status = ctx->resource_manager()->LookupOrCreate(
"TF-TRT-Engine-Cache", string(resource_name), &cache_res,
return ctx->resource_manager()->LookupOrCreate(
"TF-TRT-Engine-Cache", string(resource_name), cache_res,
{[this, ctx](TRTEngineCacheResource** cr) -> Status {
*cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_);
return Status::OK();
}});
if (!status.ok()) {
LOG(WARNING) << "Not able to find or create engine cache for " << name()
<< ". The native segment will be used instead. "
<< "Reason: " << status;
return &empty_context;
}
}
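The canonicalization comment at the top of GetEngineCacheResource suggests stripping scope prefixes from the op name before using it as a cache key. A hedged sketch of that idea (CanonicalizeOpName is a hypothetical helper; the actual implementation sits in the collapsed lines above):

#include <string>

// Hypothetical helper: drop scope prefixes so e.g. "while/body/TRTEngineOp_0"
// and "TRTEngineOp_0" resolve to the same engine-cache resource name.
std::string CanonicalizeOpName(const std::string& name) {
  const auto pos = name.find_last_of('/');
  return pos == std::string::npos ? name : name.substr(pos + 1);
}

int main() {
  return CanonicalizeOpName("while/body/TRTEngineOp_0") == "TRTEngineOp_0" ? 0
                                                                           : 1;
}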
StatusOr<EngineContext*> TRTEngineOp::GetEngine(
const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
static EngineContext empty_context;
TRTEngineCacheResource* cache_res = nullptr;
TF_RETURN_IF_ERROR(GetEngineCacheResource(ctx, &cache_res));
core::ScopedUnref sc(cache_res);
mutex_lock lock(engine_mutex_);
// TODO(tmorris): using first input to get batch size - is this reliable?
const int batch_size = input_shapes[0].dim_size(0);
auto& cache = cache_res->cache_;
auto allocator = cache_res->allocator_.get();
if (allocator == nullptr) {
@@ -687,23 +694,15 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
return cache.at(engine_input_shapes).get();
}
Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
TRTCalibrationResource** cr) {
Status TRTEngineOp::AllocateCalibrationResources(
OpKernelContext* ctx, TRTEngineCacheResource* cache_res,
TRTCalibrationResource** cr) {
auto cres = new TRTCalibrationResource();
*cr = cres;
// Get the allocator.
auto alloc = ctx->device()->GetAllocator(AllocatorAttributes());
if (!alloc) {
LOG(WARNING) << "Can't get device allocator will not be able to "
"allocate memory from TensorFlow memory pool";
cres->allocator_.reset(new TRTCudaAllocator);
} else {
cres->allocator_.reset(new TRTDeviceAllocator(alloc));
}
// Get the input shapes.
const int batch_size = ctx->input(0).dim_size(0);
const int num_inputs = ctx->num_inputs();
std::vector<PartialTensorShape> shapes;
std::vector<TensorShape> shapes;
cres->device_tensors_.resize(num_inputs);
VLOG(1) << " Constructing calibrator";
for (int i = 0; i < num_inputs; i++) {
@@ -725,8 +724,6 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
}
cres->calibrator_.reset(
new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
const string label(name());
auto segment_graph = &segment_graph_;
const int platform_gpu_id =
ctx->device()->tensorflow_gpu_device_info()->gpu_id;
if (platform_gpu_id < 0) {
@@ -734,37 +731,57 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx,
return errors::InvalidArgument(
"Context->device doesn't contain device info!");
}
const int64 workspace_size_bytes = workspace_size_;
cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes,
platform_gpu_id, workspace_size_bytes]() {
LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
<< ", Calibration Resource @ " << cres;
auto err = cudaSetDevice(platform_gpu_id);
if (err != cudaSuccess) {
// TODO(aaroey): should return error here.
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
<< " in calibration thread";
}
// ConvertGraphDefToEngine() will try to build the engine. This thread
// will loop inside buildCudaEngine() consuming the calibration data
// that is set by the TF op, and drive the builder until calibrator returns
// false. Engine is discarded after calibration table is generated
//
// TODO(aaroey): maybe setting the max batch size using the python
// calibration wrapper class.
auto s = convert::ConvertGraphDefToEngine(
*segment_graph, TrtPrecisionMode::INT8,
cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes,
&cres->logger_, cres->allocator_.get(), cres->calibrator_.get(),
&cres->engine_,
/*use_calibration=*/true,
/*convert_successfully=*/nullptr);
if (!s.ok()) {
LOG(ERROR) << "Calibration failed: " << s;
cres->calibrator_->setDone(); // Ignore further pushes
}
VLOG(1) << "Calibration loop terminated " << label;
}));
cache_res->Ref();
cres->thr_.reset(
new std::thread([this, cres, shapes, platform_gpu_id, cache_res]() {
core::ScopedUnref sc(cache_res);
LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
<< ", Calibration Resource @ " << cres;
auto err = cudaSetDevice(platform_gpu_id);
if (err != cudaSuccess) {
// TODO(aaroey): should return error here.
LOG(ERROR) << "Couldn't set cuda device to " << platform_gpu_id
<< " in calibration thread";
}
std::vector<PartialTensorShape> partial_shapes(shapes.begin(),
shapes.end());
// ConvertGraphDefToEngine() will try to build the engine. This thread
// will loop inside buildCudaEngine() consuming the calibration data
// that is set by the TF op, and drive the builder until calibrator
// returns false. Engine is discarded after calibration table is
// generated
//
// TODO(aaroey): maybe setting the max batch size using the python
// calibration wrapper class.
auto s = convert::ConvertGraphDefToEngine(
this->segment_graph_, TrtPrecisionMode::INT8,
cres->calibrator_->getBatchSize(), this->workspace_size_,
partial_shapes, &cres->logger_, cache_res->allocator_.get(),
cres->calibrator_.get(), &cres->engine_,
/*use_calibration=*/true,
/*convert_successfully=*/nullptr);
if (!s.ok()) {
LOG(ERROR) << "Calibration failed: " << s;
cres->calibrator_->setDone(); // Ignore further pushes
}
// Transfer the ownership of the engine to the engine cache, so we can
// dump it out during conversion for TF 2.0.
if (cache_res) {
mutex_lock lock(this->engine_mutex_);
cres->SetCalibrationTable();
this->calibrator_ = std::move(cres->calibrator_);
TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
cres->engine_->createExecutionContext());
cache_res->cache_.emplace(
shapes, absl::make_unique<EngineContext>(
std::move(cres->engine_), std::move(exec_context)));
}
VLOG(1) << "Calibration loop terminated " << this->name();
}));
VLOG(1) << "initialized calibrator resource";
return Status::OK();
}
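The comment about looping inside buildCudaEngine() describes a producer/consumer handshake: the op thread stages batches while the builder thread consumes them through the calibrator until setDone() ends the loop. A minimal sketch under simplified assumptions (MiniCalibrator is hypothetical; the real TRTInt8Calibrator exchanges device buffers, not ints):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

class MiniCalibrator {
 public:
  // Op thread: stage one batch; blocks until the previous one is consumed.
  bool setBatch(int batch) {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return !batch_ready_ || done_; });
    if (done_) return false;
    batch_ = batch;
    batch_ready_ = true;
    cv_.notify_all();
    return true;
  }
  // Builder thread: consume one batch; returns false once calibration is done,
  // which is what terminates the build loop inside the builder.
  bool getBatch(int* batch) {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return batch_ready_ || done_; });
    if (done_) return false;
    *batch = batch_;
    batch_ready_ = false;
    cv_.notify_all();
    return true;
  }
  // Mirrors TRTInt8Calibrator::waitAndSetDone(): drain, then signal done.
  void waitAndSetDone() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return !batch_ready_; });
    done_ = true;
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int batch_ = 0;
  bool batch_ready_ = false;
  bool done_ = false;
};

int main() {
  MiniCalibrator c;
  std::thread builder([&c] {
    int b;
    while (c.getBatch(&b)) std::printf("calibrating on batch %d\n", b);
  });
  for (int i = 0; i < 3; ++i) c.setBatch(i);  // op thread pushes batches
  c.waitAndSetDone();                         // ends the calibration loop
  builder.join();
  return 0;
}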
@@ -25,11 +25,6 @@ const absl::string_view kCalibrationContainerName = "TF-TRT-Calibration";
TRTCalibrationResource::~TRTCalibrationResource() {
VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
builder_.reset();
engine_.reset();
// We need to manually destroy the builder and engine before the allocator
// is destroyed.
allocator_.reset();
}
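The removed resets existed because C++ destroys members in reverse declaration order: allocator_ was declared after builder_ and engine_, so by default it would have been destroyed first, while the builder and engine could still reference it. With the allocator now owned by TRTEngineCacheResource, the manual ordering is unnecessary. A small illustration (Tracer is a hypothetical stand-in):

#include <cstdio>
#include <memory>

// Hypothetical tracer to make destruction order visible.
struct Tracer {
  explicit Tracer(const char* n) : name(n) {}
  ~Tracer() { std::printf("destroying %s\n", name); }
  const char* name;
};

// Mirrors the old declaration order: allocator_ declared last, hence
// destroyed first unless the destructor resets builder_/engine_ manually.
struct OldResource {
  std::unique_ptr<Tracer> builder_{new Tracer("builder")};
  std::unique_ptr<Tracer> engine_{new Tracer("engine")};
  std::unique_ptr<Tracer> allocator_{new Tracer("allocator")};
};

int main() {
  OldResource r;  // prints: allocator, engine, builder
  return 0;
}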
string TRTCalibrationResource::DebugString() const {
@@ -41,15 +36,18 @@ string TRTCalibrationResource::DebugString() const {
<< " Builder = " << hex << builder_.get() << dec << endl
<< " Engine = " << hex << engine_.get() << dec << endl
<< " Logger = " << hex << &logger_ << dec << endl
<< " Allocator = " << hex << allocator_.get() << dec << endl
<< " Thread = " << hex << thr_.get() << dec << endl;
return oss.str();
}
void TRTCalibrationResource::SetCalibrationTable() {
calibration_table_ = calibrator_->getCalibrationTableAsString();
}
Status TRTCalibrationResource::SerializeToString(string* serialized) {
calibrator_->waitAndSetDone();
thr_->join();
*serialized = calibrator_->getCalibrationTableAsString();
*serialized = calibration_table_;
if (serialized->empty()) {
return errors::Unknown("Calibration table is empty.");
}
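SerializeToString() now returns the snapshot taken by SetCalibrationTable() because the calibration thread moves calibrator_ out of the resource once calibration finishes; after that move the resource can no longer query the calibrator directly. A hedged sketch of that ordering (Calibrator is a hypothetical stand-in):

#include <memory>
#include <string>
#include <utility>

// Hypothetical stand-in for TRTInt8Calibrator.
struct Calibrator {
  std::string getCalibrationTableAsString() const { return "table-bytes"; }
};

struct CalibResource {
  std::unique_ptr<Calibrator> calibrator_{new Calibrator};
  std::string calibration_table_;
  // Mirrors SetCalibrationTable(): snapshot while the calibrator is present.
  void SetCalibrationTable() {
    calibration_table_ = calibrator_->getCalibrationTableAsString();
  }
};

int main() {
  CalibResource res;
  res.SetCalibrationTable();                // snapshot first...
  auto moved = std::move(res.calibrator_);  // ...then ownership leaves
  return res.calibration_table_.empty() ? 1 : 0;  // table survives the move
}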
@@ -45,6 +45,8 @@ class TRTCalibrationResource : public ResourceBase {
string DebugString() const override;
void SetCalibrationTable();
Status SerializeToString(string* serialized);
// Lookup table for temporary staging areas of input tensors for calibration.
@@ -53,10 +55,10 @@ class TRTCalibrationResource : public ResourceBase {
// Temporary staging areas for calibration inputs.
std::vector<PersistentTensor> device_tensors_;
string calibration_table_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
TrtUniquePtrType<nvinfer1::IBuilder> builder_;
TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
std::unique_ptr<TRTBaseAllocator> allocator_;
Logger logger_;
// TODO(sami): Use threadpool threads!
std::unique_ptr<std::thread> thr_;
@@ -58,16 +58,6 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
namespace tensorflow {
namespace tensorrt {
void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
void* memory;
cudaMalloc(&memory, size);
return memory;
}
void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
if (size == 0) return nullptr;
@@ -46,16 +46,6 @@ class TRTBaseAllocator : public nvinfer1::IGpuAllocator {
virtual ~TRTBaseAllocator() = default;
};
class TRTCudaAllocator : public TRTBaseAllocator {
// Allocator implementation that is using cuda allocator instead of device
// allocator in case we can't get device allocator from TF.
public:
TRTCudaAllocator() {}
virtual ~TRTCudaAllocator() {}
void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
void free(void* memory) override;
};
class TRTDeviceAllocator : public TRTBaseAllocator {
// Allocator implementation wrapping TF device allocators.
public:
@@ -118,6 +118,7 @@ void TRTInt8Calibrator::waitAndSetDone() {
if (!done_) {
done_ = true;
cond_.notify_all();
dev_buffers_.clear();
}
}
@@ -89,7 +89,7 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
bool done_;
// Map to keep tensorrt input buffers and sizes keyed with buffer names
const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_;
std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_;
bool calib_running_;
bool batch_is_set_;
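Dropping const from dev_buffers_ pairs with the waitAndSetDone() change above: a const member cannot be clear()ed, and clearing is what releases the staging buffers once calibration is done. A tiny sketch of the point (CalibratorSketch is hypothetical):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>

struct CalibratorSketch {
  // With 'const' here, this member could not be cleared; dropping const lets
  // waitAndSetDone() release the staging buffers once calibration ends.
  std::unordered_map<std::string, std::pair<void*, std::size_t>> dev_buffers_;
  void waitAndSetDone() { dev_buffers_.clear(); }
};

int main() {
  CalibratorSketch c;
  c.waitAndSetDone();
  return c.dev_buffers_.empty() ? 0 : 1;
}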