enable custom device save model on device memory && fix conflict (#48221)

b6aa9f53 · engineer1109 · GitHub · 5eff6f01 · b6aa9f53 · b6aa9f53
4 changed files
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -371,6 +371,11 @@ struct Argument {
  // cinn compiler related
  DECL_ARGUMENT_FIELD(use_cinn_compiler, UseCinnCompiler, bool);
+  // custom device
+  DECL_ARGUMENT_FIELD(use_custom_device, UseCustomDevice, bool);
+  DECL_ARGUMENT_FIELD(custom_device_type, CustomDeviceType, std::string);
+  DECL_ARGUMENT_FIELD(custom_device_id, CustomDeviceId, int);
 private:
  std::unordered_set<std::string> valid_fields_;
 };

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include <cstdlib>
 #include <string>
 #include <unordered_set>
@@ -26,6 +27,11 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/common/data_type.h"
+// Command-line flag, off by default. When set to true,
+// CopyParamsToCustomDevice() returns before copying anything, so the
+// parameters are not moved to the custom device.
+DEFINE_bool(
+    custom_model_save_cpu,
+    false,
+    "Keep old mode for developers, the model is saved on cpu not device.");
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -71,9 +77,9 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
    }
  }
 }
+#endif
-#else
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
  // The parameters are on the cpu, therefore, synchronization is not necessary.
  if (!argument->use_gpu()) return;
@@ -148,7 +154,62 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
    }
  }
 }
+#endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+// Copies every DenseTensor held by the scope's local variables onto the custom
+// device selected by `argument` (custom_device_type / custom_device_id).
+//
+// Returns without doing anything when custom devices are not in use, or when
+// FLAGS_custom_model_save_cpu is set (the tensors are then left untouched).
+void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
+    Argument *argument) {
+  if (!argument->use_custom_device()) return;
+
+  // On old mode, the model is saved on cpu not device.
+  // NOTE(review): as written, this check rejects the "OpenCL" device type
+  // combined with FLAGS_custom_model_save_cpu == true, while the message text
+  // reads as if it described the opposite setting -- confirm the intent.
+  if (argument->custom_device_type() == "OpenCL") {
+    PADDLE_ENFORCE_EQ(
+        FLAGS_custom_model_save_cpu,
+        false,
+        phi::errors::InvalidArgument(
+            "'FLAGS_custom_model_save_cpu = false' is only for the developers "
+            "who have not completed custom device memory settings. Setting to "
+            "true will make "
+            "model memory reserve on the cpu, and make inference slower."));
+  }
+
+  if (FLAGS_custom_model_save_cpu) return;
+
+  // NOTE(review): `repetitive_params` is collected here but never consulted
+  // below. It is kept because main_graph() may validate the graph -- confirm
+  // before removing, or use it to skip repeated parameters.
+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
+  // A separating space keeps the device type readable in the log line.
+  LOG(INFO) << "Sync params from CPU to CustomDevice "
+            << argument->custom_device_type() << "/"
+            << argument->custom_device_id();
+
+  platform::Place place = platform::CustomPlace(argument->custom_device_type(),
+                                                argument->custom_device_id());
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+
+  for (const auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::PreconditionNotMet("The var should not be nullptr"));
+
+    // Only dense tensors are transferred; other variable types are skipped.
+    if (var->IsType<phi::DenseTensor>()) {
+      auto *t = var->GetMutable<phi::DenseTensor>();
+
+      // Stage the contents in a temporary CPU tensor so that `t` can be
+      // cleared before it is filled again on the target place.
+      platform::CPUPlace cpu_place;
+      phi::DenseTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+
+      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+      t->clear();
+      paddle::framework::TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
 #endif
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
@@ -156,13 +217,20 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
      argument->scope_valid(),
      true,
      platform::errors::PreconditionNotMet("The scope field should be valid"));
 #ifdef PADDLE_WITH_ASCEND_CL
-  if (!argument->use_npu_valid()) return;
+  if (argument->use_npu_valid()) {
    CopyParamsToNpu(argument);
-#else
+  }
-  if (!argument->use_gpu_valid()) return;
+#endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (argument->use_gpu_valid()) {
    CopyParamsToGpu(argument);
+  }
+#endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (argument->use_custom_device_valid()) {
+    CopyParamsToCustomDevice(argument);
+  }
 #endif
 }

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -37,9 +37,15 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 private:
 #ifdef PADDLE_WITH_ASCEND_CL
  void CopyParamsToNpu(Argument *argument);
-#else
+#endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void CopyParamsToGpu(Argument *argument);
 #endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  void CopyParamsToCustomDevice(Argument *argument);
+#endif
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1242,6 +1242,15 @@ void AnalysisPredictor::PrepareArgument() {
  }
 #endif
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  argument_.SetUseCustomDevice(config_.use_custom_device());
+  if (config_.use_custom_device()) {
+    LOG(INFO) << "CustomDevice is enabled";
+    argument_.SetCustomDeviceType(config_.custom_device_type());
+    argument_.SetCustomDeviceId(config_.custom_device_id());
+  }
+#endif
  auto *pass_builder = config_.pass_builder();
  // TODO(inference): Need to reconstruct the pass_builder, pass should be
  // processed in a single