Unverified commit e135069d, authored by zhupengyang, committed by GitHub

[xpu][infer] support runtime configs (#53595)

Parent: d327d3e1
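In short: this commit removes the XPU path from ResourceManager (previously a predictor bound one external stream at Init) and instead lets every run carry its own runtime configuration: an optional execution stream, an L3 cache budget, an optional caller-provided L3 buffer, and an L3 autotune size. A minimal usage sketch assembled from the API and test added in this diff; model paths are illustrative:

#include <memory>
#include "paddle_inference_api.h"

void RunWithXpuRuntimeConfig() {
  paddle_infer::Config config;
  config.SetModel("model/model", "model/params");  // illustrative paths
  config.EnableXpu();
  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed inputs via predictor->GetInputHandle(...) ...

  paddle_infer::experimental::XpuRuntimeConfig runtime_config;
  runtime_config.stream = nullptr;      // nullptr: keep the predictor's stream
  runtime_config.l3_size = 16773120;    // L3 budget in bytes (the default)
  runtime_config.l3_ptr = nullptr;      // nullptr: the context allocates/owns L3
  runtime_config.l3_autotune_size = 0;  // 0: disable L3 autotune
  paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
      predictor.get(), &runtime_config);
  // ... read outputs via predictor->GetOutputHandle(...) ...
}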
@@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
   } else {
     conv_bias.push_back(0);
   }
-  if (conv->Op()->HasAttr("padding_algorithm")) {
-    conv2d_xpu_op_desc.SetAttr(
-        "padding_algorithm",
-        PADDLE_GET_CONST(std::string,
-                         conv->Op()->GetAttr("padding_algorithm")));
-  }
+  conv2d_xpu_op_desc.SetAttr(
+      "padding_algorithm",
+      conv->Op()->GetAttrIfExists<std::string>("padding_algorithm"));
   auto conv_paddings =
       PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("paddings"));
   if (conv_paddings.size() == 2) {
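A note on the hunk above: GetAttrIfExists<std::string>() yields a default-constructed value (here an empty string) when the attribute is absent, so "padding_algorithm" is now always set on the fused op instead of being skipped. A hypothetical stand-in for that accessor, just to pin down the contract the pass now relies on (not Paddle's actual implementation):

// Guarded attribute read with a T{} fallback, folded into one call.
template <typename T>
T GetAttrIfExists(const framework::OpDesc& op, const std::string& name) {
  return op.HasAttr(name) ? PADDLE_GET_CONST(T, op.GetAttr(name)) : T{};
}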
@@ -389,25 +389,21 @@ bool AnalysisPredictor::Init(
   }
 #endif
 #if defined(PADDLE_WITH_XPU)
-  if (config_.use_xpu_ && config_.use_external_stream_) {
+  if (config_.use_xpu_) {
     private_context_ = true;
-  }
-  if (private_context_) {
-    if (!status_is_cloned_) {
+    if (!status_is_cloned_ && config_.external_stream_enabled()) {
       predictor_stream_ = config_.GetExecStream();
     }
-    // NOTE: If the external_stream equals to global_device_contexts's stream,
-    // then fallback.
-    auto global_stream =
-        static_cast<phi::XPUContext *>(
-            platform::DeviceContextPool::Instance().Get(place_))
-            ->stream();
-    if (predictor_stream_ != global_stream) {
-      InitResourceManager(predictor_stream_);
-      InitDeviceContexts();
+    auto *global_context = static_cast<phi::XPUContext *>(
+        platform::DeviceContextPool::Instance().Get(place_));
+    auto global_stream = global_context->stream();
+    if (predictor_stream_ == nullptr) {
+      predictor_stream_ = global_stream;
     }
+    InitDeviceContexts();
   }
 #endif
   inference::DisplayMemoryInfo(place_, "Init predictor");
   return true;
 }
@@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   predictor_stream_ =
       ResourceManager::Instance().InitGPUResource(place_, stream);
-#elif defined(PADDLE_WITH_XPU)
-  predictor_stream_ =
-      ResourceManager::Instance().InitXPUResource(place_, stream);
 #endif
 }

 void AnalysisPredictor::InitDeviceContexts() {
-// Init GPUContext.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // Init GPUContext.
   if (place_.GetType() == phi::AllocationType::GPU) {
     device_contexts_.emplace(
         place_, std::async(std::launch::deferred, [=] {
@@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() {
         }));
   }
 #endif
-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
   if (place_.GetType() == phi::AllocationType::XPU) {
     device_contexts_.emplace(
         place_, std::async(std::launch::deferred, [=] {
-          auto *xpu_resource =
-              ResourceManager::Instance().GetXPUResource(predictor_stream_);
           auto &instance = memory::allocation::AllocatorFacade::Instance();
           auto *xpu_context = new InferXPUContext(place_);
           xpu_context->SetAllocator(instance.GetAllocator(place_).get());
@@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() {
               instance.GetZeroAllocator(place_).get());
           xpu_context->SetHostZeroAllocator(
               instance.GetZeroAllocator(platform::CPUPlace()).get());
-          xpu_context->SetStream(xpu_resource->GetStream());
-          xpu_context->SetDriverVersion(xpu_resource->GetDriverVersion());
-          xpu_context->SetRuntimeVersion(xpu_resource->GetRuntimeVersion());
-          xpu_context->SetXpuVersion(xpu_resource->GetXpuVersion());
+          xpu_context->SetStream(predictor_stream_);
           return std::unique_ptr<phi::DeviceContext>(xpu_context);
         }));
   }
 #endif
-  // TODO(Inference): Support other backends.
 }

 void *AnalysisPredictor::GetExecStream() const {
@@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const {
 bool AnalysisPredictor::PrepareScope(
     const std::shared_ptr<framework::Scope> &parent_scope) {
+#ifdef PADDLE_WITH_XPU
+  // Set "XPU_PADDLE_L3_SIZE" to "0" to avoid malloc l3 cache when xpu_context
+  // init.
+  setenv("XPU_PADDLE_L3_SIZE", "0", 0);
+#endif
   if (parent_scope) {
     PADDLE_ENFORCE_NOT_NULL(
         parent_scope,
@@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetCustomDeviceId(config_.custom_device_id());
   }
 #endif
+
 #ifdef PADDLE_WITH_XPU
   argument_->SetUseXpu(config_.use_xpu_);
   argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
@@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 }
 #endif

-bool AnalysisPredictor::ExpRunWithExternalStream(void *stream) {
-#if defined(PADDLE_WITH_XPU)
-  if (!private_context_) {
-    PADDLE_THROW(platform::errors::Fatal(
-        "Please use config.SetExecStream to init resources, and then we "
-        "will bind resources to execution stream."));
-  }
-  if (stream != predictor_stream_) {
+bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
+#ifdef PADDLE_WITH_XPU
+  PADDLE_ENFORCE(
+      private_context_,
+      paddle::platform::errors::Fatal(
+          "Must use private context if run predictor with external config."));
+
+  auto *dev_ctxs = reinterpret_cast<const std::map<
+      phi::Place,
+      std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+      this->GetDeviceContexts());
+  auto *dev_ctx =
+      static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
+
+  auto xpu_runtime_config =
+      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
+
+  auto *stream = xpu_runtime_config->stream;
+  if (stream != nullptr && stream != predictor_stream_) {
     paddle::platform::XPUStreamSync(
         static_cast<paddle::xpuStream>(predictor_stream_));
-    ResourceManager::Instance().XpuResourceReBindStream(predictor_stream_,
-                                                        stream);
     predictor_stream_ = stream;
-
-    auto *dev_ctxs = reinterpret_cast<const std::map<
-        phi::Place,
-        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
-        this->GetDeviceContexts());
-    auto *dev_ctx =
-        static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
     dev_ctx->SetStream(stream);
   }
-  return ZeroCopyRun();
+
+  size_t l3_size = xpu_runtime_config->l3_size;
+  void *l3_ptr = xpu_runtime_config->l3_ptr;
+  size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  PADDLE_ENFORCE_LE(
+      l3_autotune_size,
+      l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
+          l3_autotune_size,
+          l3_size));
+  dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size);
+
+  bool ret = ZeroCopyRun();
+  dev_ctx->L3CacheAutotune();
+  return ret;
 #endif
   return false;
 }
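Two behavioral points in the function above: a null stream in the config keeps whatever stream the predictor already uses, and l3_autotune_size must not exceed l3_size or the enforce fires. Since rebinding happens per call, one predictor can hop between streams across runs; a sketch mirroring the 2_new_different_stream test later in this diff (predictor setup and error handling elided):

void* s0 = nullptr;
void* s1 = nullptr;
xpu_stream_create(&s0);
xpu_stream_create(&s1);

// {stream, l3_size, l3_ptr, l3_autotune_size}; each run syncs the old
// stream before the device context switches over.
paddle_infer::experimental::XpuRuntimeConfig cfg0{s0, 0, nullptr, 0};
paddle_infer::experimental::XpuRuntimeConfig cfg1{s1, 0, nullptr, 0};
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(), &cfg0);
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(), &cfg1);

xpu_stream_destroy(s0);
xpu_stream_destroy(s1);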
@@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() {
   if (predictor_stream_ != nullptr) {
     ResourceManager::Instance().DestroyGPUResource(predictor_stream_);
   }
-#elif defined(PADDLE_WITH_XPU)
-  if (predictor_stream_ != nullptr) {
-    ResourceManager::Instance().DestroyXPUResource(predictor_stream_);
-  }
 #endif

   if (place_.GetType() != phi::AllocationType::UNDEFINED) {
@@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
 #endif
   return false;
 }
-bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
-                                          void *stream) {
+
+bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p,
+                                         void *config) {
   auto pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
-  return pred->ExpRunWithExternalStream(stream);
+  return pred->ExpRunWithRuntimeConfig(config);
 }

 void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
@@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor {
   // Note: Can only be used under thread_local semantics.
   bool ExpRunWithExternalStream(void *stream);

+  // Note: Can only be used under thread_local semantics.
+  bool ExpRunWithRuntimeConfig(void *config);
+
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
@@ -13,7 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/inference/api/infer_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/runtime.h"
+#endif
+#include "glog/logging.h"

 namespace paddle {
@@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
     : phi::GPUContext(place, false) {}
 #endif

-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
 InferXPUContext::InferXPUContext(const phi::Place& place)
     : phi::XPUContext(place) {}
+
+void* InferXPUContext::Alloc(phi::TensorBase* tensor,
+                             phi::DataType dtype,
+                             size_t requested_size,
+                             bool pinned,
+                             bool fake_alloc) const {
+  size_t size = tensor->numel() * phi::SizeOf(tensor->dtype());
+  if (l3_autotune_size_ > 0 && holder_map_.empty()) {
+    void* data_ptr =
+        DeviceContext::Alloc(tensor, dtype, requested_size, pinned, fake_alloc);
+    phi::XPUL3CacheBlock* l3_block = nullptr;
+    phi::Allocation* holder =
+        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
+    if (holder_l3_blocks_.count(holder) == 0) {
+      l3_block = new phi::XPUL3CacheBlock();
+      holder_l3_blocks_[holder] = l3_block;
+      l3_blocks_.push_back(l3_block);
+    } else {
+      l3_block = holder_l3_blocks_[holder];
+    }
+    l3_block->Record(size);
+    return data_ptr;
+  } else if (l3_autotune_size_ > 0 && !holder_map_.empty()) {
+    phi::Allocation* holder =
+        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
+    auto holder_iter = holder_map_.find(holder);
+    if (holder_iter != holder_map_.end()) {
+      auto& holder_pair = holder_iter->second;
+      auto* swap_holder = holder_pair.first;
+      bool& swap_holder_is_l3 = holder_pair.second;
+      if (swap_holder_is_l3 && swap_holder->size() >= size) {
+        swap(*holder, *swap_holder);
+        swap_holder_is_l3 = false;
+      } else if (!swap_holder_is_l3 && holder->size() < size) {
+        swap(*holder, *swap_holder);
+        swap_holder_is_l3 = true;
+      }
+    }
+    return DeviceContext::Alloc(
+        tensor, dtype, requested_size, pinned, fake_alloc);
+  } else {
+    return DeviceContext::Alloc(
+        tensor, dtype, requested_size, pinned, fake_alloc);
+  }
+}
+
+void InferXPUContext::SetL3Info(size_t l3_size,
+                                void* l3_ptr,
+                                size_t l3_autotune_size) {
+  if (l3_ptr == nullptr) {
+    if (l3_size_ != l3_size) {
+      if (l3_owned_) {
+        xpu_free(l3_ptr_);
+      }
+      if (l3_size > 0) {
+        xpu_malloc(&l3_ptr_, l3_size, XPU_MEM_L3);
+        if (l3_ptr_ != nullptr) {
+          VLOG(3) << "remalloc l3(" << l3_size << ") success.";
+          l3_size_ = l3_size;
+          l3_owned_ = true;
+          l3_autotune_size_ = l3_autotune_size;
+        } else {
+          VLOG(3) << "malloc l3(" << l3_size << ") failed. No l3 will be used.";
+          l3_size_ = 0;
+          l3_owned_ = false;
+          l3_autotune_size_ = 0;
+        }
+      }
+    }
+  } else {
+    if (l3_owned_) {
+      xpu_free(l3_ptr_);
+    }
+    l3_ptr_ = l3_ptr;
+    l3_size_ = l3_size;
+    l3_autotune_size_ = l3_autotune_size;
+  }
+  if (l3_autotune_size_ == 0) {
+    x_context()->_l3_mgr.set(l3_ptr_, l3_size_);
+  }
+}
+
+void InferXPUContext::L3CacheAutotune() {
+  if (l3_autotune_size_ == 0) return;
+  if (holder_map_.empty()) {
+    l3_plan_.RunAutotune(l3_blocks_, l3_size_);
+    auto* plan = l3_plan_.plan();
+    int8_t* cur_l3_ptr = reinterpret_cast<int8_t*>(l3_ptr_);
+    for (size_t i = 0; i < l3_blocks_.size(); i++) {
+      size_t block_size = plan->at(i);
+      if (block_size > 0) {
+        l3_blocks_[i]->Set(cur_l3_ptr, block_size);
+        cur_l3_ptr += block_size;
+      }
+    }
+    x_context()->_l3_mgr.set(
+        reinterpret_cast<int8_t*>(l3_ptr_) + l3_size_ - plan->back(),
+        plan->back());
+    for (auto holder_l3_block : holder_l3_blocks_) {
+      auto* l3_block = holder_l3_block.second;
+      if (l3_block->size() > 0) {
+        auto* holder = holder_l3_block.first;
+        auto place = holder->place();
+        phi::Allocation* l3_holder =
+            new phi::Allocation(l3_block->data(), l3_block->size(), place);
+        holder_map_[holder] = std::make_pair(l3_holder, true);
+      }
+    }
+  } else {
+    for (auto& holders : holder_map_) {
+      auto* holder = holders.first;
+      auto& holder_pair = holders.second;
+      if (!holder_pair.second) {
+        swap(*holder, *(holder_pair.first));
+        holder_pair.second = true;
+      }
+    }
+  }
+}
 #endif
 }  // namespace paddle
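The L3 machinery above is two-phase. On the first run with l3_autotune_size > 0, Alloc() only records each tensor holder's size history into an XPUL3CacheBlock; L3CacheAutotune() then runs the planner, carves l3_ptr_ into per-block slices (the tail slice goes to XDNN's own _l3_mgr), and fills holder_map_ so that later Alloc() calls swap holders into or out of L3 depending on whether the planned slice still fits. The per-run driver sequence, as wired up in ExpRunWithRuntimeConfig earlier in this diff:

// Inside the predictor, per run (names from this diff):
dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size);  // publish the L3 budget
bool ret = ZeroCopyRun();     // first run records sizes; later runs swap holders
dev_ctx->L3CacheAutotune();   // first run plans and builds holder_map_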
@@ -15,6 +15,9 @@
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/place.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
+#endif

 namespace paddle {
@@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext {
 };
 #endif

-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
 class InferXPUContext : public phi::XPUContext {
  public:
   explicit InferXPUContext(const phi::Place& place);
-  using phi::XPUContext::SetDriverVersion;
-  using phi::XPUContext::SetRuntimeVersion;
-  using phi::XPUContext::SetStream;
-  using phi::XPUContext::SetXpuVersion;
+
+  void* Alloc(phi::TensorBase* tensor,
+              phi::DataType dtype,
+              size_t requested_size = 0,
+              bool pinned = false,
+              bool fake_alloc = false) const override;
+
+  void SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size);
+
+  void L3CacheAutotune();
+
+ private:
+  size_t l3_size_{0};
+  void* l3_ptr_{nullptr};
+  bool l3_owned_{false};
+
+  size_t l3_autotune_size_{0};
+  mutable std::vector<phi::XPUL3CacheBlock*> l3_blocks_;
+  mutable std::unordered_map<phi::Allocation*, phi::XPUL3CacheBlock*>
+      holder_l3_blocks_;
+  mutable std::unordered_map<phi::Allocation*,
+                             std::pair<phi::Allocation*, bool>>
+      holder_map_;
+  phi::XPUL3Planner l3_plan_;
 };
 #endif

 }  // namespace paddle
@@ -471,6 +471,13 @@ class Predictor;
 class Tensor;
 using Config = paddle::AnalysisConfig;
 namespace experimental {
+
+struct XpuRuntimeConfig {
+  void* stream{nullptr};
+  size_t l3_size{16773120};
+  void* l3_ptr{nullptr};
+  size_t l3_autotune_size{0};
+};

 // Unstable interface, may be modified or deleted in the future.
 class PD_INFER_DECL InternalUtils {
  public:
@@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils {
                                     cudaStream_t stream);
   static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                     hipStream_t stream);
-  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
-                                    void* stream);
+  static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config);
+
   static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                       bool with_interleaved);
@@ -41,9 +41,6 @@
 #include "paddle/phi/backends/dynload/cusparse.h"
 #endif  // PADDLE_WITH_CUDA

-#ifdef PADDLE_WITH_XPU
-#include "paddle/phi/backends/xpu/xpu_info.h"
-#endif

 namespace paddle {
 namespace internal {
@@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const {
   if (ref_count_.count(stream) == 0) return 0;
   return ref_count_.at(stream);
 }
-#endif
-
-#if defined(PADDLE_WITH_XPU)
-// XPUContextResource
-XPUContextResource::XPUContextResource(const phi::Place& place, void* stream)
-    : place_(place) {
-  InitXPUResource(stream);
-}
-
-XPUContextResource::~XPUContextResource() {}
-
-void XPUContextResource::InitXPUResource(void* stream) {
-  phi::backends::xpu::XPUDeviceGuard guard(place_.device);
-  if (stream) {
-    owned_stream_ = false;
-    stream_ = stream;
-  }
-  InitXpuProperties();
-}
-
-void XPUContextResource::InitXpuProperties() {
-  phi::backends::xpu::XPUDeviceGuard guard(place_.device);
-  driver_version_ = phi::backends::xpu::GetDriverVersion();
-  runtime_version_ = phi::backends::xpu::GetRuntimeVersion();
-  xpu_version_ =
-      static_cast<int>(phi::backends::xpu::get_xpu_version(place_.device));
-}
-
-void* XPUContextResource::GetStream() const { return stream_; }
-
-int XPUContextResource::GetDriverVersion() const { return driver_version_; }
-
-int XPUContextResource::GetRuntimeVersion() const { return runtime_version_; }
-
-int XPUContextResource::GetXpuVersion() const { return xpu_version_; }
-
-void XPUContextResource::ReBindStream(void* stream) {
-  owned_stream_ = false;
-  stream_ = stream;
-}
-// XPUContextResource End.
-
-// Resource Manager
-void* ResourceManager::InitXPUResource(const phi::Place& place, void* stream) {
-  std::lock_guard<std::mutex> lock_gurad(xpu_mutex_);
-  if (xpu_resources_.count(stream)) {
-    Increase(stream);
-    return stream;
-  } else {
-    std::unique_ptr<XPUContextResource> resource{
-        new XPUContextResource(place, stream)};
-    void* s = resource->GetStream();
-    ref_count_[s] = 1;
-    xpu_resources_.emplace(s, std::move(resource));
-    return s;
-  }
-}
-
-XPUContextResource* ResourceManager::GetXPUResource(void* stream) const {
-  PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in xpu_resources.", stream));
-  return xpu_resources_.at(stream).get();
-}
-
-void ResourceManager::XpuResourceReBindStream(void* old_stream,
-                                              void* new_stream) {
-  PADDLE_ENFORCE_EQ(
-      xpu_resources_.count(old_stream),
-      true,
-      platform::errors::InvalidArgument(
-          "The stream[%p] not found in xpu_resources.", old_stream));
-  auto xpu_resource = std::move(xpu_resources_.at(old_stream));
-  DestroyXPUResource(old_stream);
-  PADDLE_ENFORCE_EQ(
-      ref_count_.count(old_stream),
-      0,
-      platform::errors::Fatal("xpu resources rebind stream failed."));
-  xpu_resource->ReBindStream(new_stream);
-  ref_count_[new_stream]++;
-  xpu_resources_.emplace(new_stream, std::move(xpu_resource));
-}
-
-void ResourceManager::DestroyXPUResource(void* stream) {
-  PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in xpu_resources.", stream));
-  Decrease(stream);
-}
-
-void ResourceManager::Decrease(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  --ref_count_[stream];
-  if (ref_count_[stream] == 0) {
-    ref_count_.erase(stream);
-    xpu_resources_.erase(stream);
-  }
-}
-
-void ResourceManager::Increase(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  ++ref_count_[stream];
-}
-
-int ResourceManager::RefCount(void* stream) const {
-  if (ref_count_.count(stream) == 0) return 0;
-  return ref_count_.at(stream);
-}
-// Resource Manager End.
 #endif
 }  // namespace paddle
@@ -124,33 +124,6 @@ class GPUContextResource {
 };
 #endif

-#if defined(PADDLE_WITH_XPU)
-class XPUContextResource {
- public:
-  explicit XPUContextResource(const phi::Place& place, void* stream);
-  ~XPUContextResource();
-  phi::Place Place() const;
-  void* GetStream() const;
-  int GetDriverVersion() const;
-  int GetRuntimeVersion() const;
-  int GetXpuVersion() const;
-  void ReBindStream(void* stream);
-
- private:
-  void InitXPUResource(void* stream);
-  void InitXpuProperties();
-
- private:
-  bool owned_stream_{true};
-  void* stream_;
-  phi::Place place_;
-  int driver_version_;
-  int runtime_version_;
-  int xpu_version_;
-};  // class XPUContextResource
-#endif
-
 class ResourceManager {
  public:
   ResourceManager() = default;
@@ -168,9 +141,8 @@ class ResourceManager {
   std::mutex cpu_mutex_;
   std::unique_ptr<CPUContextResource> cpu_resource_{nullptr};

-// GPU Resource
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // GPU Resource
  public:
   void* InitGPUResource(const phi::Place& place, void* stream);
   void DestroyGPUResource(void* stream);
@@ -190,28 +162,6 @@ class ResourceManager {
       gpu_resources_;
 #endif

-// XPU Resource
-#if defined(PADDLE_WITH_XPU)
- public:
-  void* InitXPUResource(const phi::Place& place, void* stream);
-  void DestroyXPUResource(void* stream);
-  XPUContextResource* GetXPUResource(void* stream) const;
-  int RefCount(void* stream) const;
-  void XpuResourceReBindStream(void* old_stream, void* new_stream);
-
- private:
-  void Decrease(void* stream);
-  void Increase(void* stream);
-
- private:
-  std::mutex xpu_mutex_;
-  // a stream corresponding to a series of resource.
-  std::map<void* /*stream*/, std::atomic<int>> ref_count_;
-  std::map<void* /*stream*/, std::unique_ptr<XPUContextResource>>
-      xpu_resources_;
-#endif
-
  private:
   DISABLE_COPY_AND_ASSIGN(ResourceManager);
 };
@@ -38,7 +38,7 @@ endif()
 if(WITH_XPU)
   list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
   list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
-       xpu/xpu2_op_list.cc)
+       xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc)
 endif()

 if(WITH_MKLDNN)
@@ -42,11 +42,13 @@ struct XPUContext::Impl {
     auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
     for (unsigned int i = 0; i < selected_xpus.size(); i++) {
       if (place_.GetDeviceId() == selected_xpus[i]) {
-        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
-          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
-                     l3_size,
-                     XPU_MEM_L3);
+        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
+          xpu_free(l3ptrs[place_.GetDeviceId()]);
+          l3ptrs[place_.GetDeviceId()] = nullptr;
         }
+        xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
+                   l3_size,
+                   XPU_MEM_L3);
         if (l3ptrs[place_.GetDeviceId()] != nullptr) {
           context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
           VLOG(3) << "xpu place " << static_cast<int>(place_.GetDeviceId())
New file: paddle/phi/backends/xpu/xpu_l3_strategy.cc

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#include "glog/logging.h"
namespace phi {
void XPUL3CacheBlock::Set(void* addr, size_t size) {
if (addr == nullptr || size == 0) {
LOG(FATAL) << "Set XPUL3CacheBlock Size as Zero";
}
addr_ = addr;
size_ = size;
}
void XPUL3Planner::RunAutotune(
const std::vector<XPUL3CacheBlock*>& l3_block_dict, size_t l3_size) {
if (l3_block_dict.size() == 0 || l3_size <= 0 || !plan_.empty()) {
return;
}
VLOG(3) << "AutoTune XPU L3 Cache Block Start.";
struct node {
size_t weights = 0;
size_t scores = 0;
std::vector<size_t> choices{0};
};
std::vector<std::vector<node>> records;
std::vector<size_t> record_map;
size_t total_scores = 0;
for (size_t block_idx = 0; block_idx < l3_block_dict.size(); block_idx++) {
XPUL3CacheBlock* cur_block = l3_block_dict[block_idx];
std::vector<size_t>& history = cur_block->history_;
auto history_size = history.size();
size_t score = 0;
VLOG(3) << "Block Idx is " << block_idx;
if (history_size > 1) {
std::vector<node> block_nodes{node()};
std::sort(history.begin(), history.end());
for (size_t i = 0; i < history_size; i++) {
VLOG(3) << "Size History : " << i << " is " << history[i];
if (history[i] > l3_size) {
break;
}
score += history[i];
if (i == history_size - 1 || history[i + 1] != history[i]) {
node cur_node;
cur_node.weights = history[i];
cur_node.choices = {history[i]};
cur_node.scores = score;
block_nodes.push_back(cur_node);
VLOG(3) << "Node Weights is:" << cur_node.weights
<< ", Node Scores is: " << score;
}
}
total_scores += score;
records.push_back(block_nodes);
record_map.push_back(block_idx);
}
}
if (records.size() <= 0) {
return;
}
std::vector<node> res(records[0]);
for (size_t block_idx = 1; block_idx < records.size(); block_idx++) {
std::vector<node> new_nodes;
for (size_t node_idx = 0; node_idx < records[block_idx].size();
node_idx++) {
for (size_t res_idx = 0; res_idx < res.size(); res_idx++) {
node cur_node;
size_t cur_weights =
records[block_idx][node_idx].weights + res[res_idx].weights;
if (cur_weights > l3_size) {
break;
}
cur_node.scores =
records[block_idx][node_idx].scores + res[res_idx].scores;
cur_node.weights = cur_weights;
cur_node.choices = res[res_idx].choices;
cur_node.choices.push_back(records[block_idx][node_idx].choices[0]);
new_nodes.push_back(cur_node);
}
}
struct {
bool operator()(node a, node b) const {
if (a.weights < b.weights) {
return true;
} else if (a.weights == b.weights) {
return a.scores > b.scores;
} else {
return false;
}
}
} customLess;
std::sort(new_nodes.begin(), new_nodes.end(), customLess);
std::vector<bool> stay(new_nodes.size(), true);
for (int i = new_nodes.size() - 1; i >= 0; i--) {
for (int j = i - 1; j >= 0; j--) {
if (new_nodes[j].scores >= new_nodes[i].scores) {
stay[i] = false;
break;
}
}
}
res.clear();
for (size_t i = 0; i < new_nodes.size(); i++) {
if (stay[i] == true) {
res.push_back(new_nodes[i]);
}
}
VLOG(3) << "XPU L3 Block IDX is " << block_idx
<< ", Choices before filter are " << new_nodes.size()
<< ", Choices after filter are " << res.size();
}
// final result: res.back().choices
// std::vector<size_t> record_map;
for (size_t i = 0; i < res.back().choices.size(); i++) {
VLOG(3) << "BLOCK IDX is " << i << ", Acquired L3 Size is "
<< res.back().choices[i];
}
double l3_global_ratio = static_cast<double>(res.back().scores) /
static_cast<double>(total_scores);
VLOG(3) << "Tensor Space in L3 / Tensor Space in Global :"
<< l3_global_ratio * 100 << " %";
size_t block_l3_size =
std::accumulate(res.back().choices.begin(), res.back().choices.end(), 0);
size_t xdnn_ctx_l3_size = (l3_size - block_l3_size) / 64 * 64;
VLOG(3) << "Block L3 Size : " << block_l3_size
<< ", XDNN Ctx L3 Size : " << xdnn_ctx_l3_size;
plan_.resize(l3_block_dict.size() + 1, 0);
for (size_t i = 0; i < res.back().choices.size(); i++) {
plan_[record_map[i]] = res.back().choices[i];
}
plan_[l3_block_dict.size()] = xdnn_ctx_l3_size;
VLOG(3) << "AutoTune XPU L3 Cache Block End.";
}
} // namespace phi
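To make the planner's behavior concrete, here is a self-contained toy run (hypothetical sizes, not from this commit): three cache blocks compete for a 4 KB budget, each with the size history it would have accumulated during the recording pass. plan() returns one slice per block plus a trailing entry reserved for the XDNN context.

#include <iostream>
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"

int main() {
  phi::XPUL3CacheBlock b0, b1, b2;
  for (size_t s : {1024, 1024, 2048}) b0.Record(s);  // favors a 2048-byte slice
  for (size_t s : {512, 3072}) b1.Record(s);
  for (size_t s : {2048, 2048}) b2.Record(s);

  phi::XPUL3Planner planner;
  planner.RunAutotune({&b0, &b1, &b2}, /*l3_size=*/4096);

  // Four entries: slices for b0, b1, b2, then the leftover given to XDNN.
  for (size_t v : *planner.plan()) std::cout << v << std::endl;
  return 0;
}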
New file: paddle/phi/backends/xpu/xpu_l3_strategy.h

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <numeric>
#include <vector>
namespace phi {
struct XPUL3CacheBlock {
public:
void Clear() {
addr_ = nullptr;
size_ = 0;
history_.clear();
}
void Set(void* addr, size_t size);
void Record(size_t size) { history_.push_back(size); }
void* data() { return addr_; }
size_t size() { return size_; }
private:
void* addr_{nullptr};
size_t size_{0};
public:
std::vector<size_t> history_;
};
class XPUL3Planner {
public:
void RunAutotune(const std::vector<XPUL3CacheBlock*>& l3_block_dict,
size_t l3_size);
std::vector<size_t>* plan() { return &plan_; }
private:
std::vector<size_t> plan_;
};
} // namespace phi
@@ -393,11 +393,8 @@ template <typename T>
 T* DeviceContext::Alloc(TensorBase* tensor,
                         size_t requested_size,
                         bool pinned) const {
-  if (pinned) {
-    return impl_->Alloc<T>(
-        tensor, GetPinnedPlace(GetPlace()), requested_size, pinned);
-  }
-  return impl_->Alloc<T>(tensor, GetPlace(), requested_size, pinned);
+  DataType dtype = phi::CppTypeToDataType<T>::Type();
+  return static_cast<T*>(this->Alloc(tensor, dtype, requested_size, pinned));
 }

 void* DeviceContext::HostAlloc(TensorBase* tensor,
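This device_context change is what lets InferXPUContext intercept allocations at all: the typed template Alloc<T> now funnels into the type-erased Alloc, which becomes virtual in the header hunk that follows. A minimal standalone sketch of the pattern (illustrative names, not Paddle's classes):

#include <cstdio>
#include <cstdlib>

class DeviceContextBase {
 public:
  virtual ~DeviceContextBase() = default;
  // Type-erased hook; subclasses override this single entry point.
  virtual void* Alloc(size_t bytes) const { return std::malloc(bytes); }
  // Typed helper: T is not deducible from the argument, so the inner call
  // resolves to the virtual overload and dispatches dynamically.
  template <typename T>
  T* Alloc(size_t n) const {
    return static_cast<T*>(Alloc(n * sizeof(T)));
  }
};

class InterceptingContext : public DeviceContextBase {
 public:
  void* Alloc(size_t bytes) const override {
    std::printf("intercepted %zu bytes\n", bytes);  // e.g. record for an L3 plan
    return DeviceContextBase::Alloc(bytes);
  }
};

int main() {
  InterceptingContext ctx;
  DeviceContextBase& base = ctx;
  float* p = base.Alloc<float>(256);  // prints "intercepted 1024 bytes"
  std::free(p);
  return 0;
}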
@@ -145,11 +145,11 @@ class PADDLE_API DeviceContext {
   /**
    * @brief Allocate device memory for tensor.
    */
-  void* Alloc(TensorBase*,
-              DataType dtype,
-              size_t requested_size = 0,
-              bool pinned = false,
-              bool fake_alloc = false) const;
+  virtual void* Alloc(TensorBase*,
+                      DataType dtype,
+                      size_t requested_size = 0,
+                      bool pinned = false,
+                      bool fake_alloc = false) const;

   template <typename T>
   T* Alloc(TensorBase* tensor,
@@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
       --repeat=10)
   endif()

+  if(WITH_XPU)
+    inference_analysis_test(
+      xpu_runtime_config_resnet50_test
+      SRCS
+      xpu_runtime_config_resnet50_test.cc
+      EXTRA_DEPS
+      paddle_inference_shared
+      ARGS
+      --infer_model=${RESNET50_MODEL_DIR})
+  endif()
+
   set(inference_deps ${analysis_deps} paddle_inference_api analysis
       naive_executor ${GLOB_PASS_LIB})
@@ -17,10 +17,6 @@
 #if defined(PADDLE_WITH_CUDA)
 #include <cuda_runtime.h>
 #endif
-#if defined(PADDLE_WITH_XPU)
-#include "xpu/runtime.h"
-#include "xpu/xdnn.h"
-#endif

 #include <glog/logging.h>
 #include <gtest/gtest.h>
@@ -671,57 +667,6 @@ TEST(Predictor, Streams) {
 }
 #endif

-#if defined(PADDLE_WITH_XPU)
-TEST(Predictor, XPUStreams) {
-  // external stream
-  {
-    auto context = baidu::xpu::api::create_context();
-    xpu_stream_create(&context->xpu_stream);
-
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableXpu();
-    config.SetExecStream(static_cast<void*>(context->xpu_stream));
-    CHECK_EQ(config.external_stream_enabled(), true);
-
-    auto predictor = CreatePredictor(config);
-    auto stream = predictor->GetExecStream();
-    CHECK_EQ(static_cast<void*>(context->xpu_stream), stream);
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1);
-  }
-
-  // 2 predictor on 2 stream
-  {
-    auto context1 = baidu::xpu::api::create_context();
-    xpu_stream_create(&context1->xpu_stream);
-
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableXpu();
-    config.SetExecStream(static_cast<void*>(context1->xpu_stream));
-    auto predictor = CreatePredictor(config);
-    auto stream1 = predictor->GetExecStream();
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream1));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream1), 1);
-
-    auto context2 = baidu::xpu::api::create_context();
-    xpu_stream_create(&context2->xpu_stream);
-
-    Config config2;
-    config2.SetModel(FLAGS_dirname);
-    config2.EnableXpu();
-    config2.SetExecStream(static_cast<void*>(context2->xpu_stream));
-    auto predictor2 = CreatePredictor(config2);
-    auto stream2 = predictor2->GetExecStream();
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream2));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 1);
-    CHECK_NE(stream1, stream2);
-  }
-}
-#endif
-
 TEST(AnalysisPredictor, OutputHookFunc) {
   auto hookfunc = [](const std::string& type,
                      const std::string& var_name,
New file: xpu_runtime_config_resnet50_test.cc (inference API tests)

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
Config XpuConfig() {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
return config;
}
TEST(resnet50_xpu, basic) {
Config config = XpuConfig();
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \
Config config##idx_ = XpuConfig(); \
auto predictor##idx_ = CreatePredictor(config##idx_); \
PrepareInput(predictor##idx_); \
experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
&config_); \
CompareOutput(predictor##idx_); \
CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream);
TEST(runtime_stream, null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
TEST(runtime_stream, new_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
TEST(runtime_stream, null_and_new_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0};
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config1 = {
stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_same_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_different_stream) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
void RunPredictorWithRuntimeConfig(
std::shared_ptr<Predictor> predictor,
experimental::XpuRuntimeConfig runtime_config) {
PrepareInput(predictor);
experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(),
&runtime_config);
CompareOutput(predictor);
CHECK_EQ(predictor->GetExecStream(), runtime_config.stream);
}
TEST(runtime_stream, 2_thread) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
std::thread t0(
RunPredictorWithRuntimeConfig, predictor0, xpu_runtime_config0);
std::thread t1(
RunPredictorWithRuntimeConfig, predictor1, xpu_runtime_config1);
t0.join();
t1.join();
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
} // namespace paddle_infer