From afe829f3ba08ab982931746cac06709eec59fc2f Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 29 Nov 2018 08:52:26 +0000 Subject: [PATCH] cherry-pick from #14649 Add params sync pass test=release/1.2 --- .../inference/analysis/passes/CMakeLists.txt | 3 +- .../passes/ir_analysis_compose_pass.cc | 1 + .../analysis/passes/ir_graph_build_pass.cc | 7 +- .../ir_params_sync_among_devices_pass.cc | 74 +++++++++++++++++++ .../ir_params_sync_among_devices_pass.h | 39 ++++++++++ .../fluid/inference/analysis/passes/passes.cc | 4 + .../fluid/inference/api/paddle_pass_builder.h | 6 +- paddle/fluid/inference/utils/benchmark.cc | 2 +- paddle/fluid/inference/utils/benchmark.h | 8 +- 9 files changed, 128 insertions(+), 16 deletions(-) create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a30c27b118..d3ea511d8f 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,7 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) +cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) set(analysis_deps ${analysis_deps} ir_graph_build_pass diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 108cb6f74b..c3a2b3ca1d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", }); for (const auto &pass : passes) { VLOG(2) << "Run pass " << pass; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index d5e0d90de1..740030c3a8 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { // so that the parameters will on the same device, or they will keep copying // between difference devices. platform::Place place; - if (argument->use_gpu()) { - PADDLE_ENFORCE(argument->gpu_device_id_valid()); - place = platform::CUDAPlace(argument->gpu_device_id()); - } else { - place = platform::CPUPlace(); - } + place = platform::CPUPlace(); if (argument->model_dir_valid()) { auto program = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc new file mode 100644 index 0000000000..8be2d3ac0b --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE(argument->scope_valid()); + PADDLE_ENFORCE(argument->use_gpu_valid()); + + platform::Place place; + + // The parameters are on the cpu, therefore, synchronization is not necessary. + if (!argument->use_gpu()) return; + + LOG(INFO) << "Sync params from CPU to GPU"; + + PADDLE_ENFORCE(argument->gpu_device_id_valid()); + place = platform::CUDAPlace(argument->gpu_device_id()); + + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); + + // We get all the vars from local_scope instead of the ProgramDesc. + // Because there exists the case that new parameter variables are not added to + // the program in the analysis pass. + for (auto &var_name : all_vars) { + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE(var != nullptr); + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + // Copy the parameter data to a tmp tensor. + TensorCopySync(*t, cpu_place, &temp_tensor); + // Reallocation the space on GPU + t->mutable_data(place); + + // Copy parameter data to newly allocated GPU space. + TensorCopySync(temp_tensor, place, t); + } + } +} + +std::string IrParamsSyncAmongDevicesPass::repr() const { + return "ir-params-sync-among-devices-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h new file mode 100644 index 0000000000..a95f460df6 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Sync parameter from CPU to GPU. + */ +class IrParamsSyncAmongDevicesPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 2ef515f45f..9245e32cee 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" namespace paddle { namespace inference { @@ -27,6 +28,9 @@ PassRegistry::PassRegistry() { std::unique_ptr(new IrGraphBuildPass)); passes_.emplace("ir_analysis_compose_pass", std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace( + "ir_params_sync_among_devices_pass", + std::unique_ptr(new IrParamsSyncAmongDevicesPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 12e3a6f42e..825bee833b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { - // TODO(NHZlX) Problem with Data synchronization between GPU and CPU - // When running in GPU mode, the parameters are all on GPU. But the - // opearations of "conv_bn_fuse_pass" are on CPU. passes_.assign({ - "infer_clean_graph_pass", - // "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", "conv_bn_fuse_pass", }); } diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index 021edc2de5..d03aa11b75 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const { ss << batch_size_ << "\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; - ss << 1000 / latency_; + ss << 1000.0 / latency_; ss << '\n'; return ss.str(); } diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 80e8f77adb..76a3dd2c29 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,9 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include namespace paddle { namespace inference { @@ -31,8 +33,8 @@ struct Benchmark { bool use_gpu() const { return use_gpu_; } void SetUseGpu() { use_gpu_ = true; } - int latency() const { return latency_; } - void SetLatency(int x) { latency_ = x; } + float latency() const { return latency_; } + void SetLatency(float x) { latency_ = x; } const std::string& name() const { return name_; } void SetName(const std::string& name) { name_ = name; } @@ -43,7 +45,7 @@ struct Benchmark { private: bool use_gpu_{false}; int batch_size_{0}; - int latency_; + float latency_; int num_threads_{1}; std::string name_; }; -- GitLab