提交 afe829f3 编写于 作者: N nhzlx

cherry-pick from #14649

Add params sync pass

test=release/1.2
上级 25c2cdaf
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
set(analysis_deps ${analysis_deps} set(analysis_deps ${analysis_deps}
ir_graph_build_pass ir_graph_build_pass
......
...@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { ...@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({ std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass", "ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
}); });
for (const auto &pass : passes) { for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass; VLOG(2) << "Run pass " << pass;
......
...@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { ...@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
// so that the parameters will on the same device, or they will keep copying // so that the parameters will on the same device, or they will keep copying
// between difference devices. // between difference devices.
platform::Place place; platform::Place place;
if (argument->use_gpu()) { place = platform::CPUPlace();
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
} else {
place = platform::CPUPlace();
}
if (argument->model_dir_valid()) { if (argument->model_dir_valid()) {
auto program = auto program =
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE(argument->scope_valid());
PADDLE_ENFORCE(argument->use_gpu_valid());
platform::Place place;
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->mutable_data<float>(place);
// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
}
}
}
std::string IrParamsSyncAmongDevicesPass::repr() const {
return "ir-params-sync-among-devices-pass";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Sync parameter from CPU to GPU.
*/
class IrParamsSyncAmongDevicesPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() { ...@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
std::unique_ptr<AnalysisPass>(new IrGraphBuildPass)); std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
passes_.emplace("ir_analysis_compose_pass", passes_.emplace("ir_analysis_compose_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass)); std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
passes_.emplace(
"ir_params_sync_among_devices_pass",
std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
} }
} // namespace analysis } // namespace analysis
......
...@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy { ...@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
class GpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy {
public: public:
GpuPassStrategy() : PassStrategy({}) { GpuPassStrategy() : PassStrategy({}) {
// TODO(NHZlX) Problem with Data synchronization between GPU and CPU
// When running in GPU mode, the parameters are all on GPU. But the
// opearations of "conv_bn_fuse_pass" are on CPU.
passes_.assign({ passes_.assign({
"infer_clean_graph_pass", "infer_clean_graph_pass", "conv_bn_fuse_pass",
// "infer_clean_graph_pass", "conv_bn_fuse_pass",
}); });
} }
......
...@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const { ...@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
ss << batch_size_ << "\t"; ss << batch_size_ << "\t";
ss << num_threads_ << "\t"; ss << num_threads_ << "\t";
ss << latency_ << "\t"; ss << latency_ << "\t";
ss << 1000 / latency_; ss << 1000.0 / latency_;
ss << '\n'; ss << '\n';
return ss.str(); return ss.str();
} }
......
...@@ -11,9 +11,11 @@ ...@@ -11,9 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <string>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -31,8 +33,8 @@ struct Benchmark { ...@@ -31,8 +33,8 @@ struct Benchmark {
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
void SetUseGpu() { use_gpu_ = true; } void SetUseGpu() { use_gpu_ = true; }
int latency() const { return latency_; } float latency() const { return latency_; }
void SetLatency(int x) { latency_ = x; } void SetLatency(float x) { latency_ = x; }
const std::string& name() const { return name_; } const std::string& name() const { return name_; }
void SetName(const std::string& name) { name_ = name; } void SetName(const std::string& name) { name_ = name; }
...@@ -43,7 +45,7 @@ struct Benchmark { ...@@ -43,7 +45,7 @@ struct Benchmark {
private: private:
bool use_gpu_{false}; bool use_gpu_{false};
int batch_size_{0}; int batch_size_{0};
int latency_; float latency_;
int num_threads_{1}; int num_threads_{1};
std::string name_; std::string name_;
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册