cherry-pick from #14649

Add params sync pass test=release/1.2

cherry-pick from #14649
Add params sync pass test=release/1.2
afe829f3 · nhzlx · 25c2cdaf · afe829f3 · afe829f3 · afe829f3
9 changed file
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
+cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
 set(analysis_deps ${analysis_deps}
        ir_graph_build_pass

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
  std::vector<std::string> passes({
      "ir_graph_build_pass", "ir_analysis_pass",
+      "ir_params_sync_among_devices_pass",
  });
  for (const auto &pass : passes) {
    VLOG(2) << "Run pass " << pass;

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
  // so that the parameters will on the same device, or they will keep copying
  // between difference devices.
  platform::Place place;
-  if (argument->use_gpu()) {
-    PADDLE_ENFORCE(argument->gpu_device_id_valid());
-    place = platform::CUDAPlace(argument->gpu_device_id());
-  } else {
  place = platform::CPUPlace();
-  }
  if (argument->model_dir_valid()) {
    auto program =

--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
+  PADDLE_ENFORCE(argument->scope_valid());
+  PADDLE_ENFORCE(argument->use_gpu_valid());
+  platform::Place place;
+  // The parameters are on the cpu, therefore, synchronization is not necessary.
+  if (!argument->use_gpu()) return;
+  LOG(INFO) << "Sync params from CPU to GPU";
+  PADDLE_ENFORCE(argument->gpu_device_id_valid());
+  place = platform::CUDAPlace(argument->gpu_device_id());
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+  // We get all the vars from local_scope instead of the ProgramDesc.
+  // Because there exists the case that new parameter variables are not added to
+  // the program in the analysis pass.
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE(var != nullptr);
+    if (var->IsType<framework::LoDTensor>() ||
+        var->IsType<framework::Tensor>()) {
+      auto *t = var->GetMutable<framework::LoDTensor>();
+      platform::CPUPlace cpu_place;
+      framework::LoDTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+      temp_tensor.mutable_data<float>(cpu_place);
+      // Copy the parameter data to a tmp tensor.
+      TensorCopySync(*t, cpu_place, &temp_tensor);
+      // Reallocation the space on GPU
+      t->mutable_data<float>(place);
+      // Copy parameter data to newly allocated GPU space.
+      TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
+std::string IrParamsSyncAmongDevicesPass::repr() const {
+  return "ir-params-sync-among-devices-pass";
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+/*
+ * Sync parameter from CPU to GPU.
+ */
+class IrParamsSyncAmongDevicesPass : public AnalysisPass {
+ public:
+  void RunImpl(Argument *argument) override;
+  std::string repr() const override;
+};
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 namespace paddle {
 namespace inference {
@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
  passes_.emplace("ir_analysis_compose_pass",
                  std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+  passes_.emplace(
+      "ir_params_sync_among_devices_pass",
+      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
 class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
-    // TODO(NHZlX) Problem with Data synchronization between GPU and CPU
-    // When running in GPU mode, the parameters are all on GPU. But the
-    // opearations of "conv_bn_fuse_pass" are on CPU.
    passes_.assign({
-        "infer_clean_graph_pass",
+        "infer_clean_graph_pass", "conv_bn_fuse_pass",
-        // "infer_clean_graph_pass", "conv_bn_fuse_pass",
    });
  }

--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
  ss << batch_size_ << "\t";
  ss << num_threads_ << "\t";
  ss << latency_ << "\t";
-  ss << 1000 / latency_;
+  ss << 1000.0 / latency_;
  ss << '\n';
  return ss.str();
 }

--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,9 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
 #include <fstream>
 #include <iostream>
+#include <string>
 namespace paddle {
 namespace inference {
@@ -31,8 +33,8 @@ struct Benchmark {
  bool use_gpu() const { return use_gpu_; }
  void SetUseGpu() { use_gpu_ = true; }
-  int latency() const { return latency_; }
+  float latency() const { return latency_; }
-  void SetLatency(int x) { latency_ = x; }
+  void SetLatency(float x) { latency_ = x; }
  const std::string& name() const { return name_; }
  void SetName(const std::string& name) { name_ = name; }
@@ -43,7 +45,7 @@ struct Benchmark {
 private:
  bool use_gpu_{false};
  int batch_size_{0};
-  int latency_;
+  float latency_;
  int num_threads_{1};
  std::string name_;
 };