未验证 提交 89d09e65 编写于 作者: T tensor-tang 提交者: GitHub

Merge branch 'develop' into fea/ut/vis

...@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl( ...@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in})); desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in})); desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in})); desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Out", std::vector<std::string>({conv_relu_out})); desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d"); desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) { for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second); desc.SetAttr(attr.first, attr.second);
......
...@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
"mul_gru_fuse_pass", // "mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", // "seq_concat_fc_fuse_pass", //
"fc_fuse_pass", // "fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", //
#endif
}}; }};
std::unordered_set<std::string> disabled_ir_passes_; std::unordered_set<std::string> disabled_ir_passes_;
......
...@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) { ...@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
} }
void PrintTime(int batch_size, int repeat, int num_threads, int tid, void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) { double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid << ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======"; << ", latency: " << latency << "ms ======";
if (epoch > 1) {
int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples
<< ", average latency of each sample: " << latency / samples
<< "ms ======";
}
} }
} // namespace inference } // namespace inference
......
...@@ -51,12 +51,10 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc ...@@ -51,12 +51,10 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
# text_classification # text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
--topn=1 # Just run top 1 batch.
)
# ocr # ocr
set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
......
...@@ -12,21 +12,7 @@ ...@@ -12,21 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path, ...@@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size, const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data, const int repeat, bool test_all_data,
bool use_analysis = false) { bool use_analysis = false) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) {
AnalysisConfig cfg; AnalysisConfig cfg;
cfg.model_dir = model_path; cfg.model_dir = model_path;
cfg.use_gpu = false; cfg.use_gpu = false;
cfg.device = 0; cfg.device = 0;
cfg.specify_input_name = true; cfg.specify_input_name = true;
cfg.enable_ir_optim = true; cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) {
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
for (int i = 0; i < FLAGS_burning; i++) { for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; Timer timer;
if (test_all_data) { if (FLAGS_test_all_data) {
double sum = 0; LOG(INFO) << "test all data";
LOG(INFO) << "Total number of samples: " << data.datasets.size(); std::vector<std::vector<PaddleTensor>> input_slots_all;
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size); GetOneBatch(&input_slots, &data, batch_size);
timer.tic(); input_slots_all.emplace_back(input_slots);
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
} }
PrintTime(batch_size, repeat, 1, 0, sum / repeat); LOG(INFO) << "total number of samples: " << data.datasets.size();
LOG(INFO) << "Average latency of each sample: " TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
<< sum / repeat / data.datasets.size() << " ms";
return; return;
} }
timer.tic(); timer.tic();
...@@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path, ...@@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path,
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); CompareResult(ref_outputs_slots, outputs_slots);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
......
...@@ -12,20 +12,7 @@ ...@@ -12,20 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -113,17 +100,6 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, ...@@ -113,17 +100,6 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25}; 48, 39, 38, 16, 25};
void TestChineseNERPrediction(bool use_analysis) { void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
AnalysisConfig cfg; AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__"; cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param"; cfg.param_file = FLAGS_infer_model + "/param";
...@@ -131,32 +107,28 @@ void TestChineseNERPrediction(bool use_analysis) { ...@@ -131,32 +107,28 @@ void TestChineseNERPrediction(bool use_analysis) {
cfg.device = 0; cfg.device = 0;
cfg.specify_input_name = true; cfg.specify_input_name = true;
cfg.enable_ir_optim = true; cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
if (FLAGS_test_all_data) { if (FLAGS_test_all_data) {
LOG(INFO) << "test all data"; LOG(INFO) << "test all data";
double sum = 0;
size_t num_samples;
for (int i = 0; i < FLAGS_repeat; i++) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
// Just one batch, the num_samples remains the same. std::vector<std::vector<PaddleTensor>> input_slots_all;
num_samples = data.num_samples; for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic(); input_slots_all.emplace_back(input_slots);
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
} }
LOG(INFO) << "total number of samples: " << num_samples; LOG(INFO) << "total number of samples: " << data.num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
return; return;
} }
// Prepare inputs. // Prepare inputs.
...@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { ...@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); CompareResult(ref_outputs_slots, outputs);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
......
...@@ -12,24 +12,7 @@ ...@@ -12,24 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model. // Test with a really complicate model.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config; AnalysisConfig config;
...@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { ...@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
config.ir_passes.clear(); // Do not exclude any pass. config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor = auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
...@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { ...@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
base_predictor->Run(input_slots, &base_outputs); base_predictor->Run(input_slots, &base_outputs);
std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots);
if (num_threads == 1) { if (num_threads == 1) {
// Prepare inputs. TestOneThreadPrediction(config, input_slots_all, &outputs);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
CompareResult(outputs, base_outputs); CompareResult(outputs, base_outputs);
} else { } else {
std::vector<std::thread> threads; // only return the output of first thread
std::vector<std::unique_ptr<PaddlePredictor>> predictors; TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
} }
if (use_analysis && activate_ir) { if (use_analysis && activate_ir) {
...@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) { ...@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
// Directly infer with the original model. // Directly infer with the original model.
TestRNN1Prediction(false, false, i); TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the // Inference with the original model with the analysis turned on, the
// analysis // analysis module will transform the program to a data flow graph.
// module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i); TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large // Inference with analysis and IR. The IR module will fuse some large
// kernels. // kernels.
......
...@@ -12,23 +12,7 @@ ...@@ -12,23 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -37,24 +21,25 @@ struct DataReader { ...@@ -37,24 +21,25 @@ struct DataReader {
explicit DataReader(const std::string &path) explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {} : file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) { bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1); PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line; std::string line;
tensor->lod.clear(); PaddleTensor tensor;
tensor->lod.emplace_back(std::vector<size_t>({0})); tensor.dtype = PaddleDType::INT64;
tensor.lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data; std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false; if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data); inference::split_to_int64(line, ' ', &data);
} }
tensor->lod.front().push_back(data.size()); tensor.lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t)); tensor.data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear(); tensor.shape.push_back(data.size());
tensor->shape.push_back(data.size()); tensor.shape.push_back(1);
tensor->shape.push_back(1); input->assign({tensor});
return true; return true;
} }
...@@ -68,32 +53,28 @@ void Main(int batch_size) { ...@@ -68,32 +53,28 @@ void Main(int batch_size) {
config.model_dir = FLAGS_infer_model; config.model_dir = FLAGS_infer_model;
config.use_gpu = false; config.use_gpu = false;
config.enable_ir_optim = true; config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1); std::vector<PaddleTensor> input_slots, output_slots;
// one batch starts DataReader reader(FLAGS_infer_data);
// data -- std::vector<std::vector<PaddleTensor>> input_slots_all;
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
int num_batches = 0; int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) { while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
DataReader reader(FLAGS_infer_data); input_slots_all.emplace_back(input_slots);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
++num_batches; ++num_batches;
} }
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
return;
} }
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// one batch starts
// data --
reader.NextBatch(&input_slots, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
// Get output // Get output
LOG(INFO) << "get outputs " << output_slots.size(); LOG(INFO) << "get outputs " << output_slots.size();
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
void TestOneThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], outputs);
}
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
inputs.size());
}
void TestMultiThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
std::vector<PaddleTensor> outputs_tid;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs_tid.size(); j++) {
predictors[tid]->Run(inputs_tid[j], &outputs_tid);
}
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times, inputs_tid.size());
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
}
void TestPrediction(AnalysisConfig config,
const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) {
if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs);
} else {
TestMultiThreadPrediction(config, inputs, outputs, num_threads);
}
}
} // namespace inference
} // namespace paddle
...@@ -302,8 +302,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -302,8 +302,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bool fuse_relu = ctx.Attr<bool>("fuse_relu"); bool fuse_relu = ctx.Attr<bool>("fuse_relu");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation // TODO: add support for dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE( PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet"); "dilation in convolution is not implemented yet");
...@@ -314,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -314,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims()); std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz = std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives // Get unique name for storing MKLDNN primitives
...@@ -327,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -327,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_src_md = platform::MKLDNNMemDesc( auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format()); {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc( auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format()); {weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format /* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose * ('any') which lets a primitive (convolution in this case) choose
...@@ -340,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -340,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); weights_tz, platform::MKLDNNGetDataType<T>(),
(g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation. std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr. // Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
......
cc_library(stringpiece SRCS piece.cc) cc_library(stringpiece SRCS piece.cc)
cc_library(pretty_log SRCS pretty_log.cc) cc_library(pretty_log SRCS pretty_log.cc)
cc_test(test_pretty_log SRCS pretty_log.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc) cc_test(to_string_test SRCS to_string_test.cc)
...@@ -50,6 +50,7 @@ function(py_test_modules TARGET_NAME) ...@@ -50,6 +50,7 @@ function(py_test_modules TARGET_NAME)
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
...@@ -65,11 +66,12 @@ if(WITH_DISTRIBUTE) ...@@ -65,11 +66,12 @@ if(WITH_DISTRIBUTE)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册