Merge branch 'develop' into fea/ut/vis

89d09e65 · tensor-tang · GitHub · 0bd6476f · d4a5326a · 89d09e65
12 changed file
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
-    desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
+    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
    desc.SetType("conv2d");
    for (auto& attr : conv->Op()->GetAttrMap()) {
      desc.SetAttr(attr.first, attr.second);

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
      "mul_gru_fuse_pass",         //
      "seq_concat_fc_fuse_pass",   //
      "fc_fuse_pass",              //
+#ifdef PADDLE_WITH_MKLDNN
+      "conv_relu_mkldnn_fuse_pass",  //
+#endif
  }};

  std::unordered_set<std::string> disabled_ir_passes_;

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
 }

 void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency) {
+               double latency, int epoch = 1) {
  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
            << ", threads: " << num_threads << ", thread id: " << tid
            << ", latency: " << latency << "ms ======";
+  if (epoch > 1) {
+    int samples = batch_size * epoch;
+    LOG(INFO) << "====== sample number: " << samples
+              << ", average latency of each sample: " << latency / samples
+              << "ms ======";
+  }
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -51,12 +51,10 @@ inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
 download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
-inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
+inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
-         --topn=1 # Just run top 1 batch.
-         )
+         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)

 # ocr
 set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")

--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -12,21 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path for LAC");
-DEFINE_string(infer_data, "", "data file for LAC");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -126,46 +112,37 @@ void TestLACPrediction(const std::string &model_path,
                       const std::string &data_file, const int batch_size,
                       const int repeat, bool test_all_data,
                       bool use_analysis = false) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
+  AnalysisConfig cfg;
+  cfg.model_dir = model_path;
+  cfg.use_gpu = false;
+  cfg.device = 0;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;
+
  std::vector<PaddleTensor> input_slots, outputs_slots;
  DataRecord data(data_file, batch_size);
  GetOneBatch(&input_slots, &data, batch_size);
  std::unique_ptr<PaddlePredictor> predictor;
  if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.model_dir = model_path;
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }
  for (int i = 0; i < FLAGS_burning; i++) {
    predictor->Run(input_slots, &outputs_slots);
  }
  Timer timer;
-  if (test_all_data) {
-    double sum = 0;
-    LOG(INFO) << "Total number of samples: " << data.datasets.size();
-    for (int i = 0; i < repeat; i++) {
-      for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-        GetOneBatch(&input_slots, &data, batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs_slots);
-        sum += timer.toc();
-      }
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+      GetOneBatch(&input_slots, &data, batch_size);
+      input_slots_all.emplace_back(input_slots);
    }
-    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-    LOG(INFO) << "Average latency of each sample: "
-              << sum / repeat / data.datasets.size() << " ms";
+    LOG(INFO) << "total number of samples: " << data.datasets.size();
+    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
    return;
  }
  timer.tic();
@@ -190,19 +167,10 @@ void TestLACPrediction(const std::string &model_path,
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], pdata[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs_slots);

    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -12,20 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -113,50 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                       48, 39, 38, 16, 25};

 void TestChineseNERPrediction(bool use_analysis) {
-  NativeConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
+  AnalysisConfig cfg;
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.param_file = FLAGS_infer_model + "/param";
+  cfg.use_gpu = false;
+  cfg.device = 0;
+  cfg.specify_input_name = true;
+  cfg.enable_ir_optim = true;

  std::vector<PaddleTensor> input_slots, outputs;
  std::unique_ptr<PaddlePredictor> predictor;
  Timer timer;
  if (use_analysis) {
-    AnalysisConfig cfg;
-    cfg.prog_file = FLAGS_infer_model + "/__model__";
-    cfg.param_file = FLAGS_infer_model + "/param";
-    cfg.use_gpu = false;
-    cfg.device = 0;
-    cfg.specify_input_name = true;
-    cfg.enable_ir_optim = true;
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }

  if (FLAGS_test_all_data) {
    LOG(INFO) << "test all data";
-    double sum = 0;
-    size_t num_samples;
-    for (int i = 0; i < FLAGS_repeat; i++) {
-      DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      // Just one batch, the num_samples remains the same.
-      num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
-        PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs);
-        sum += timer.toc();
-      }
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
+      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+      input_slots_all.emplace_back(input_slots);
    }
-    LOG(INFO) << "total number of samples: " << num_samples;
-    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
-    LOG(INFO) << "average latency of each sample: "
-              << sum / FLAGS_repeat / num_samples;
+    LOG(INFO) << "total number of samples: " << data.num_samples;
+    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
    return;
  }
  // Prepare inputs.
@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], result[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs);

    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
 // Test with a really complicate model.
 void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  AnalysisConfig config;
@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  config.ir_passes.clear();  // Do not exclude any pass.

  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;

  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {

  base_predictor->Run(input_slots, &base_outputs);

+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  input_slots_all.emplace_back(input_slots);
  if (num_threads == 1) {
-    // Prepare inputs.
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    TestOneThreadPrediction(config, input_slots_all, &outputs);
    CompareResult(outputs, base_outputs);
  } else {
-    std::vector<std::thread> threads;
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
+    // only return the output of first thread
+    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
  }

  if (use_analysis && activate_ir) {
@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
    // Directly infer with the original model.
    TestRNN1Prediction(false, false, i);
    // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
+    // analysis module will transform the program to a data flow graph.
    TestRNN1Prediction(true, false, i);
    // Inference with analysis and IR. The IR module will fuse some large
    // kernels.

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -12,23 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/api/timer.h"
-
-DEFINE_string(infer_model, "", "Directory of the inference model.");
-DEFINE_string(infer_data, "", "Path of the dataset.");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "How many times to repeat run.");
-DEFINE_int32(topn, -1, "Run top n batches of data to save time");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -37,24 +21,25 @@ struct DataReader {
  explicit DataReader(const std::string &path)
      : file(new std::ifstream(path)) {}

-  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+  bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
    PADDLE_ENFORCE_EQ(batch_size, 1);
    std::string line;
-    tensor->lod.clear();
-    tensor->lod.emplace_back(std::vector<size_t>({0}));
+    PaddleTensor tensor;
+    tensor.dtype = PaddleDType::INT64;
+    tensor.lod.emplace_back(std::vector<size_t>({0}));
    std::vector<int64_t> data;

    for (int i = 0; i < batch_size; i++) {
      if (!std::getline(*file, line)) return false;
      inference::split_to_int64(line, ' ', &data);
    }
-    tensor->lod.front().push_back(data.size());
+    tensor.lod.front().push_back(data.size());

-    tensor->data.Resize(data.size() * sizeof(int64_t));
-    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
-    tensor->shape.clear();
-    tensor->shape.push_back(data.size());
-    tensor->shape.push_back(1);
+    tensor.data.Resize(data.size() * sizeof(int64_t));
+    memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
+    tensor.shape.push_back(data.size());
+    tensor.shape.push_back(1);
+    input->assign({tensor});
    return true;
  }

@@ -68,32 +53,28 @@ void Main(int batch_size) {
  config.model_dir = FLAGS_infer_model;
  config.use_gpu = false;
  config.enable_ir_optim = true;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-
-  std::vector<PaddleTensor> input_slots(1);
-  // one batch starts
-  // data --
-  auto &input = input_slots[0];
-  input.dtype = PaddleDType::INT64;

-  inference::Timer timer;
-  double sum = 0;
-  std::vector<PaddleTensor> output_slots;
+  std::vector<PaddleTensor> input_slots, output_slots;
+  DataReader reader(FLAGS_infer_data);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;

-  int num_batches = 0;
-  for (int t = 0; t < FLAGS_repeat; t++) {
-    DataReader reader(FLAGS_infer_data);
-    while (reader.NextBatch(&input, FLAGS_batch_size)) {
-      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
-      timer.tic();
-      CHECK(predictor->Run(input_slots, &output_slots));
-      sum += timer.toc();
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    int num_batches = 0;
+    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+      input_slots_all.emplace_back(input_slots);
      ++num_batches;
    }
+    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+    return;
  }
-  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+
+  // one batch starts
+  // data --
+  reader.NextBatch(&input_slots, FLAGS_batch_size);
+  input_slots_all.emplace_back(input_slots);
+  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);

  // Get output
  LOG(INFO) << "get outputs " << output_slots.size();

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data file");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &base_out = base_outputs[i];
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+                                   1, [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_EQ(size, size1);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    }
+  }
+}
+
+void TestOneThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    for (size_t j = 0; j < inputs.size(); j++) {
+      predictor->Run(inputs[j], outputs);
+    }
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
+            inputs.size());
+}
+
+void TestMultiThreadPrediction(
+    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
+    std::vector<PaddleTensor> *outputs, int num_threads) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
+  // because AttentionLSTM's hard code nodeid will be damanged.
+  for (int tid = 0; tid < num_threads; ++tid) {
+    predictors.emplace_back(
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+            config));
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // Each thread should have local inputs and outputs.
+      // The inputs of each thread are all the same.
+      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
+      std::vector<PaddleTensor> outputs_tid;
+      Timer timer;
+      timer.tic();
+      for (int i = 0; i < num_times; i++) {
+        for (size_t j = 0; j < inputs_tid.size(); j++) {
+          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
+        }
+      }
+      PrintTime(batch_size, num_times, num_threads, tid,
+                timer.toc() / num_times, inputs_tid.size());
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
+void TestPrediction(AnalysisConfig config,
+                    const std::vector<std::vector<PaddleTensor>> inputs,
+                    std::vector<PaddleTensor> *outputs, int num_threads) {
+  if (num_threads == 1) {
+    TestOneThreadPrediction(config, inputs, outputs);
+  } else {
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -302,8 +302,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
    int groups = ctx.Attr<int>("groups");

-    // TODO(pzelazko-intel) add support for group convolution and dilation
-    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
+    // TODO: add support for dilation
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");
@@ -314,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
+    int g = std::max(groups, 1);
+    if (g > 1) {
+      int o = weights_tz[0];
+      int i = weights_tz[1];
+      int h = weights_tz[2];
+      int w = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = g;
+      weights_tz[1] = o / g;
+      weights_tz[2] = i;
+      weights_tz[3] = h;
+      weights_tz[4] = w;
+    }
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // Get unique name for storing MKLDNN primitives
@@ -327,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_src_md = platform::MKLDNNMemDesc(
        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);

    /* create memory descriptor for convolution without specified format
     * ('any') which lets a primitive (convolution in this case) choose
@@ -340,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto src_md = platform::MKLDNNMemDesc(
        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                               // Currently used whenever bias is != nullptr.
    auto dst_md = platform::MKLDNNMemDesc(

--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
 cc_library(stringpiece SRCS piece.cc)
 cc_library(pretty_log SRCS pretty_log.cc)
-cc_test(test_pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -50,6 +50,7 @@ function(py_test_modules TARGET_NAME)
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
+list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
@@ -65,11 +66,12 @@ if(WITH_DISTRIBUTE)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
+    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
+    py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
+    py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
-py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
-py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
 py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)