Unverified commit 141b2854 authored by jianghaicheng, committed by GitHub

ipu_inference (#37102)

* add ipu_inference

* resolve comments

* resolve comments

* add EnableIpu introduction

* rm line

* restore npu update

* add ernie and resnet50 test

* fix copyright time
Co-authored-by: yaozhixin <522190855@qq.com>
Parent ab6daf84
......@@ -13,14 +13,12 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
......
......@@ -49,6 +49,9 @@ if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
else()
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
if(WITH_IPU)
target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils)
endif()
endif()
if(NOT APPLE)
......
......@@ -273,6 +273,14 @@ struct Argument {
DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
int);
// ipu related
DECL_ARGUMENT_FIELD(use_ipu, UseIpu, bool);
DECL_ARGUMENT_FIELD(ipu_device_num, IpuDeviceNum, int);
DECL_ARGUMENT_FIELD(ipu_enable_pipelining, IpuEnablePipelining, bool);
DECL_ARGUMENT_FIELD(ipu_batches_per_step, IpuBatchesPerStep, int);
DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int);
DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool);
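// DECL_ARGUMENT_FIELD(name, Name, type) generates a name() getter and a
// SetName() setter on Argument; the IPU fields above are filled through
// SetUseIpu()/SetIpuDeviceNum()/... in AnalysisPredictor::PrepareArgument()
// and read back via use_ipu()/ipu_device_num()/... in IrGraphBuildPass.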
private:
std::unordered_set<std::string> valid_fields_;
};
......
......@@ -65,6 +65,27 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
platform::errors::PreconditionNotMet(
"The scope ptr should not be nullptr."));
argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
// ipu related
#ifdef PADDLE_WITH_IPU
if (argument->Has("use_ipu")) {
if (argument->use_ipu()) {
argument->main_graph().SetNotOwned("num_ipus",
&argument->ipu_device_num());
argument->main_graph().SetNotOwned("need_avg_shard",
&argument->ipu_need_avg_shard());
argument->main_graph().SetNotOwned("enable_pipelining",
&argument->ipu_enable_pipelining());
argument->main_graph().SetNotOwned("batches_per_step",
&argument->ipu_batches_per_step());
argument->main_graph().SetNotOwned("batch_size",
&argument->ipu_batch_size());
} else {
PADDLE_THROW(
platform::errors::Unimplemented("Please compile with WITH_IPU"));
}
}
#endif
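// A sketch of how downstream IPU passes can read these attributes back
// (assumes the standard ir::Graph attribute API; illustrative only):
//   auto &graph = argument->main_graph();
//   int num_ipus = graph.Get<int>("num_ipus");
//   bool pipelining = graph.Get<bool>("enable_pipelining");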
}
std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
......
......@@ -46,6 +46,9 @@ PassStrategy *AnalysisConfig::pass_builder() const {
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy);
} else if (use_ipu_) {
LOG(INFO) << "Create IPU IR passes";
pass_builder_.reset(new IpuPassStrategy);
} else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy);
......@@ -139,6 +142,20 @@ void AnalysisConfig::EnableNpu(int device_id) {
Update();
}
void AnalysisConfig::EnableIpu(int device_num, bool ipu_enable_pipelining,
int ipu_batches_per_step, int ipu_batch_size,
bool ipu_need_avg_shard) {
enable_ir_optim_ = true;
use_ipu_ = true;
ipu_device_num_ = device_num;
ipu_enable_pipelining_ = ipu_enable_pipelining;
ipu_batches_per_step_ = ipu_batches_per_step;
ipu_batch_size_ = ipu_batch_size;
ipu_need_avg_shard_ = ipu_need_avg_shard;
Update();
}
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
......@@ -233,12 +250,23 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(thread_local_stream_);
// ipu related
CP_MEMBER(use_ipu_);
CP_MEMBER(ipu_device_num_);
CP_MEMBER(ipu_enable_pipelining_);
CP_MEMBER(ipu_batches_per_step_);
CP_MEMBER(ipu_batch_size_);
CP_MEMBER(ipu_need_avg_shard_);
if (use_gpu_) {
PADDLE_ENFORCE_EQ(use_xpu_, false,
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(other.pass_builder())));
} else if (use_ipu_) {
pass_builder_.reset(new IpuPassStrategy(
*static_cast<IpuPassStrategy *>(other.pass_builder())));
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(other.pass_builder())));
......@@ -413,7 +441,8 @@ void AnalysisConfig::Update() {
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
((use_xpu() ^ pass_builder_->use_xpu())) ||
((use_npu() ^ pass_builder_->use_npu()))) {
((use_npu() ^ pass_builder_->use_npu())) ||
((use_ipu() ^ pass_builder_->use_ipu()))) {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy);
......@@ -421,6 +450,9 @@ void AnalysisConfig::Update() {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
} else if (use_ipu()) {
VLOG(1) << "IpuPassStrategy has been used for new.";
pass_builder_.reset(new IpuPassStrategy);
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
......@@ -441,6 +473,10 @@ void AnalysisConfig::Update() {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(pass_builder_.get())));
} else if (use_ipu()) {
VLOG(1) << "IpuPassStrategy has been used.";
pass_builder_.reset(new IpuPassStrategy(
*static_cast<IpuPassStrategy *>(pass_builder_.get())));
} else if (use_xpu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
......@@ -565,6 +601,13 @@ void AnalysisConfig::Update() {
"with NPU-runtime."));
#endif
}
if (use_ipu_) {
#ifndef PADDLE_WITH_IPU
PADDLE_THROW(platform::errors::Unavailable(
"You tried to enable the ipu "
"but did not have the option -DWITH_IPU compiled."));
#endif
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
......@@ -635,6 +678,13 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << thread_local_stream_;
ss << use_ipu_;
ss << ipu_device_num_;
ss << ipu_enable_pipelining_;
ss << ipu_batches_per_step_;
ss << ipu_batch_size_;
ss << ipu_need_avg_shard_;
return ss.str();
}
......
......@@ -110,6 +110,14 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
// TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
pt.data.length());
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
std::memcpy(static_cast<void *>(input_ptr), pt.data.data(),
pt.data.length());
#else
PADDLE_THROW(paddle::platform::errors::Fatal(
"Not compile with WITH_IPU, should not reach here."));
#endif
} else if (platform::is_gpu_place(place)) {
PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), false,
platform::errors::InvalidArgument(
......@@ -294,6 +302,14 @@ bool AnalysisPredictor::CreateExecutor() {
"engine), but Paddle was not compiled "
"with LITE_WITH_NNADAPTER."));
}
} else if (config_.use_ipu()) {
#ifdef PADDLE_WITH_IPU
place_ = paddle::platform::IPUPlace();
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use IPU forward propagation, but Paddle was not compiled "
"with WITH_IPU."));
#endif
} else {
place_ = paddle::platform::CPUPlace();
}
......@@ -643,6 +659,13 @@ void AnalysisPredictor::PrepareArgument() {
LOG(INFO) << "Lite subgraph engine is enabled";
}
argument_.SetUseIpu(config_.use_ipu_);
argument_.SetIpuDeviceNum(config_.ipu_device_num());
argument_.SetIpuEnablePipelining(config_.ipu_enable_pipelining_);
argument_.SetIpuBatchesPerStep(config_.ipu_batches_per_step_);
argument_.SetIpuBatchSize(config_.ipu_batch_size_);
argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_);
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
......@@ -916,6 +939,10 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_ipu_place(place_)) {
// Currently, tensor copies between the CPU and the IPU are handled inside
// IpuBackend, so the place is reported as kCPU here.
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
if (config_.lite_engine_enabled()) {
// Currently, Paddle-Lite's XPU user interface only supports the transfer
......@@ -951,6 +978,10 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_ipu_place(place_)) {
// Currently, tensor copies between the CPU and the IPU are handled inside
// IpuBackend, so the place is reported as kCPU here.
res->SetPlace(PaddlePlace::kCPU);
} else if (platform::is_xpu_place(place_)) {
if (config_.lite_engine_enabled()) {
// Currently, Paddle-Lite's XPU user interface only supports the transfer
......
......@@ -239,6 +239,14 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#endif
} else if (paddle::platform::is_ipu_place(t_place)) {
#ifdef PADDLE_WITH_IPU
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with IPU place because paddle is not compiled "
"with IPU."));
#endif
} else if (place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -230,6 +230,24 @@ struct PD_INFER_DECL AnalysisConfig {
bool autotune = true, const std::string& autotune_file = "",
const std::string& precision = "int16",
bool adaptive_seqlen = false);
///
/// \brief Turn on IPU.
///
/// \param device_num The number of IPUs.
/// \param ipu_enable_pipelining Enable data pipelining between subgraphs,
/// where each subgraph is placed on one IPU. (This feature requires the
/// number of IPUs > 1.)
/// \param ipu_batches_per_step The number of batches (each of ipu_batch_size)
/// processed per run. (This feature requires pipelining to be enabled.)
/// \param ipu_batch_size The micro batch size, i.e. the batch_size used in
/// the graph.
/// \param ipu_need_avg_shard Enable automatic graph sharding. (This feature
/// requires the number of IPUs > 1.)
///
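/// A minimal usage sketch (illustrative only; "model_dir" is a placeholder
/// path for a saved inference model):
///
///   AnalysisConfig config;
///   config.SetModel("model_dir/model", "model_dir/params");
///   // num_ipu, enable_pipelining, batches_per_step, batch_size,
///   // need_avg_shard
///   config.EnableIpu(1, false, 1, 1, false);
///   auto predictor = CreatePaddlePredictor(config);
///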
void EnableIpu(int device_num = 1, bool ipu_enable_pipelining = false,
int ipu_batches_per_step = 1, int ipu_batch_size = 1,
bool ipu_need_avg_shard = false);
///
/// \brief Set XPU device id.
///
......@@ -260,6 +278,11 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return bool Whether the NPU is turned on.
///
bool use_npu() const { return use_npu_; }
/// \brief A boolean state telling whether the IPU is turned on.
///
/// \return bool Whether the IPU is turned on.
///
bool use_ipu() const { return use_ipu_; }
///
/// \brief Get the GPU device id.
///
......@@ -278,6 +301,11 @@ struct PD_INFER_DECL AnalysisConfig {
/// \return int The NPU device id.
///
int npu_device_id() const { return npu_device_id_; }
/// \brief Get the number of IPU devices.
///
/// \return int The number of IPU devices.
///
int ipu_device_num() const { return ipu_device_num_; }
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
......@@ -840,6 +868,15 @@ struct PD_INFER_DECL AnalysisConfig {
bool use_mkldnn_bfloat16_{false};
std::unordered_set<std::string> bfloat16_enabled_op_types_;
// ipu related.
bool use_ipu_{false};
int ipu_device_num_{1};
bool ipu_enable_pipelining_{false};
int ipu_batches_per_step_{1};
int ipu_batch_size_{1};
bool ipu_need_avg_shard_{false};
// If the config is already used on a predictor, it becomes invalid.
// Any config can only be used with one predictor.
// Variables held by config can take up a lot of memory in some cases.
......
......@@ -300,4 +300,8 @@ void CpuPassStrategy::EnableMkldnnBfloat16() {
#endif
}
IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) {
passes_.assign({"inference_process_pass"});
}
} // namespace paddle
......@@ -148,6 +148,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in npu mode.
bool use_npu() const { return use_npu_; }
/// \brief Check if we are using ipu.
/// \return A bool variable implying whether we are in ipu mode.
bool use_ipu() const { return use_ipu_; }
/// \brief Default destructor.
virtual ~PassStrategy() = default;
......@@ -156,6 +160,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
bool use_xpu_{false};
bool use_gpu_{false};
bool use_npu_{false};
bool use_ipu_{false};
bool use_mkldnn_{false};
/// \endcond
};
......@@ -259,6 +264,22 @@ class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
}
};
/// \class IpuPassStrategy
/// \brief The IPU passes controller, it is used in AnalysisPredictor with IPU
/// mode.
class PD_INFER_DECL IpuPassStrategy final : public PassStrategy {
public:
/// \brief Default constructor of IpuPassStrategy.
IpuPassStrategy();
/// \brief Construct by copying another IpuPassStrategy object.
/// \param[in] other The IpuPassStrategy object we want to copy.
explicit IpuPassStrategy(const IpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
use_ipu_ = true;
}
};
/// \brief List of tensorRT subgraph passes.
PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
......
......@@ -757,3 +757,12 @@ endif()
if(ON_INFER OR WITH_GPU)
set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
endif()
# IPU
if (WITH_IPU)
#resnet50
set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50")
inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=1000)
endif()
......@@ -122,5 +122,51 @@ TEST(Analyzer_Ernie, compare_results) {
}
}
#ifdef PADDLE_WITH_IPU
// IPU: Compare Deterministic result
TEST(Analyzer_Ernie_ipu, ipu_compare_determine) {
AnalysisConfig cfg;
SetIpuConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
// IPU: Compare results
TEST(Analyzer_Ernie_ipu, ipu_compare_results) {
AnalysisConfig cfg;
SetIpuConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
LoadInputData(&input_slots_all);
std::ifstream fin(FLAGS_refer_result);
std::string line;
std::vector<float> ref;
while (std::getline(fin, line)) {
Split(line, ' ', &ref);
}
auto predictor = CreateTestPredictor(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
FLAGS_use_analysis);
std::vector<PaddleTensor> outputs;
for (size_t i = 0; i < input_slots_all.size(); i++) {
outputs.clear();
predictor->Run(input_slots_all[i], &outputs);
auto outputs_size = outputs.front().data.length() / (sizeof(float));
for (size_t j = 0; j < outputs_size; ++j) {
EXPECT_NEAR(ref[i * outputs_size + j],
static_cast<float *>(outputs[0].data.data())[j],
FLAGS_accuracy);
}
}
}
#endif
} // namespace inference
} // namespace paddle
......@@ -148,5 +148,11 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false,
cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
}
void SetIpuConfig(AnalysisConfig *cfg, int batch_size = 1) {
cfg->SetModel(FLAGS_infer_model);
// num_ipu, enable_pipelining, batches_per_step, batch_size, need_avg_shard
cfg->EnableIpu(4, false, 1, batch_size, true);
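// i.e. run on 4 IPUs without pipelining, one batch per step, the given
// batch size, and automatic graph sharding enabled.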
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
static std::vector<float> truth_values = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
// Compare results with 1 batch
TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
AnalysisConfig config;
// num_ipu, enable_pipelining, batches_per_step, batch_size,
// need_avg_shard
config.EnableIpu(1, false);
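// (the remaining arguments fall back to their defaults:
// batches_per_step = 1, batch_size = 1, need_avg_shard = false)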
config.SetModel(model_dir + "/model", model_dir + "/params");
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
PaddleTensor in;
in.shape = {batch, channel, height, width};
in.data =
PaddleBuf(static_cast<void*>(input.data()), input_num * sizeof(float));
in.dtype = PaddleDType::FLOAT32;
inputs.emplace_back(in);
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(inputs, &outputs));
const size_t expected_size = 1;
EXPECT_EQ(outputs.size(), expected_size);
float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
12e-5);
}
}
// Compare results with 2 batch
TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) {
std::string model_dir = FLAGS_infer_model + "/" + "model";
AnalysisConfig config;
// num_ipu, enable_pipelining, batches_per_step, batch_size,
// need_avg_shard
config.EnableIpu(2, false, 1, 2, true);
config.SetModel(model_dir + "/model", model_dir + "/params");
std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config);
const int batch = 2;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
PaddleTensor in;
in.shape = {batch, channel, height, width};
in.data =
PaddleBuf(static_cast<void*>(input.data()), input_num * sizeof(float));
in.dtype = PaddleDType::FLOAT32;
inputs.emplace_back(in);
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(inputs, &outputs));
const size_t expected_size = 1;
EXPECT_EQ(outputs.size(), expected_size);
float* data_o = static_cast<float*>(outputs[0].data.data());
auto num_output_per_batch = outputs[0].data.length() / sizeof(float) / 2;
for (size_t j = 0; j < num_output_per_batch; j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
12e-5);
EXPECT_NEAR((data_o[j + num_output_per_batch] - truth_values[j / 10]) /
truth_values[j / 10],
0., 12e-5);
}
}
} // namespace inference
} // namespace paddle
\ No newline at end of file