Commit 62c9ed11, authored by dongshuilong

merge develop

......@@ -38,7 +38,7 @@ Res2Net200_vd预训练模型Top-1精度高达85.1%。
* You can scan the WeChat group QR code below to join the PaddleClas WeChat group for more efficient Q&A and full exchanges with developers from all walks of life. We look forward to your joining.
<div align="center">
<img src="https://user-images.githubusercontent.com/12560511/156726948-9b5d6de9-f526-4032-8ee3-eec0ca16e52e.jpg" width="200"/>
<img src="https://user-images.githubusercontent.com/12560511/158047154-6c7418fa-3705-492c-9cdc-255b13eeffad.jpg" width="200"/>
</div>
## Quick Start
......
......@@ -41,7 +41,7 @@ Four sample solutions are provided, including product recognition, vehicle recog
* You can also scan the QR code below to join the PaddleClas WeChat group to get more efficient answers to your questions and to communicate with developers from all walks of life. We look forward to hearing from you.
<div align="center">
<img src="https://user-images.githubusercontent.com/12560511/156726948-9b5d6de9-f526-4032-8ee3-eec0ca16e52e.jpg" width="200"/>
<img src="https://user-images.githubusercontent.com/12560511/158047154-6c7418fa-3705-492c-9cdc-255b13eeffad.jpg" width="200"/>
</div>
## Quick Start
......
......@@ -17,7 +17,6 @@ ${info LITE_ROOT: $(abspath ${LITE_ROOT})}
THIRD_PARTY_DIR=third_party
${info THIRD_PARTY_DIR: $(abspath ${THIRD_PARTY_DIR})}
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgcodecs.a \
${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_imgproc.a \
......@@ -32,6 +31,8 @@ OPENCV_LIBS = ${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/libs/libopencv_im
${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libtbb.a \
${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/3rdparty/libs/libcpufeatures.a
FAISS_VERSION=faiss1.5.3
FAISS_LIBS = ${THIRD_PARTY_DIR}/${FAISS_VERSION}/libs/${ARM_PLAT}/libfaiss.a
LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared
###############################################################
......@@ -45,7 +46,7 @@ LITE_LIBS = -L${LITE_ROOT}/cxx/lib/ -lpaddle_light_api_shared
# 2. Uncomment the line below to use `libpaddle_api_light_bundled.a`
# LITE_LIBS = ${LITE_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} $(SYSTEM_LIBS)
CXX_LIBS = $(LITE_LIBS) ${OPENCV_LIBS} ${FAISS_LIBS} $(SYSTEM_LIBS)
LOCAL_DIRSRCS=$(wildcard src/*.cc)
LOCAL_SRCS=$(notdir $(LOCAL_DIRSRCS))
......@@ -53,9 +54,17 @@ LOCAL_OBJS=$(patsubst %.cpp, %.o, $(patsubst %.cc, %.o, $(LOCAL_SRCS)))
JSON_OBJS = json_reader.o json_value.o json_writer.o
pp_shitu: $(LOCAL_OBJS) $(JSON_OBJS) fetch_opencv
pp_shitu: $(LOCAL_OBJS) $(JSON_OBJS) fetch_opencv fetch_faiss
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) $(LOCAL_OBJS) $(JSON_OBJS) -o pp_shitu $(CXX_LIBS) $(LDFLAGS)
fetch_faiss:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${FAISS_VERSION}.tar.gz || \
(echo "fetch faiss libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${FAISS_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${FAISS_VERSION} || \
tar -xf ${THIRD_PARTY_DIR}/${FAISS_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
......@@ -74,11 +83,12 @@ fetch_json_code:
LOCAL_INCLUDES = -I./ -Iinclude
OPENCV_INCLUDE = -I${THIRD_PARTY_DIR}/${OPENCV_VERSION}/${ARM_PLAT}/include
FAISS_INCLUDE = -I${THIRD_PARTY_DIR}/${FAISS_VERSION}/include
JSON_INCLUDE = -I${THIRD_PARTY_DIR}/jsoncpp_code/include
CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_INCLUDES = ${LOCAL_INCLUDES} ${INCLUDES} ${OPENCV_INCLUDE} ${FAISS_INCLUDE} ${JSON_INCLUDE} -I$(LITE_ROOT)/cxx/include
$(LOCAL_OBJS): %.o: src/%.cc fetch_opencv fetch_json_code
$(LOCAL_OBJS): %.o: src/%.cc fetch_opencv fetch_json_code fetch_faiss
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -c $< -o $@
$(JSON_OBJS): %.o: ${THIRD_PARTY_DIR}/jsoncpp_code/%.cpp fetch_json_code
......
......@@ -191,16 +191,28 @@ cd deploy/lite_shitu
```shell
# To test a single image
python generate_json_config.py --det_model_path ppshitu_lite_models_v1.0/mainbody_PPLCNet_x2_5_640_quant_v1.0_lite.nb --rec_model_path ppshitu_lite_models_v1.0/general_PPLCNet_x2_5_quant_v1.0_lite.nb --rec_label_path ppshitu_lite_models_v1.0/label.txt --img_path images/demo.jpg
python generate_json_config.py --det_model_path ppshitu_lite_models_v1.0/mainbody_PPLCNet_x2_5_640_quant_v1.0_lite.nb --rec_model_path ppshitu_lite_models_v1.0/general_PPLCNet_x2_5_lite_v1.0_infer.nb --img_path images/demo.jpg
# or
# To test multiple images
python generate_json_config.py --det_model_path ppshitu_lite_models_v1.0/mainbody_PPLCNet_x2_5_640_quant_v1.0_lite.nb --rec_model_path ppshitu_lite_models_v1.0/general_PPLCNet_x2_5_quant_v1.0_lite.nb --rec_label_path ppshitu_lite_models_v1.0/label.txt --img_dir images
python generate_json_config.py --det_model_path ppshitu_lite_models_v1.0/mainbody_PPLCNet_x2_5_640_quant_v1.0_lite.nb --rec_model_path ppshitu_lite_models_v1.0/general_PPLCNet_x2_5_lite_v1.0_infer.nb --img_dir images
# After execution finishes, the shitu_config.json configuration file is generated under lite_shitu
```
### 2.3 Index Dictionary Conversion
Because the Python retrieval library serializes its dictionary with `pickle`, it is inconvenient for C++ to read, so a conversion is needed
```shell
# Download the bottled drink dataset
wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/data/drink_dataset_v1.0.tar && tar -xf drink_dataset_v1.0.tar
rm -rf drink_dataset_v1.0.tar
# Convert id_map.pkl into id_map.txt
python transform_id_map.py -c ../configs/inference_drink.yaml
```
After a successful conversion, `id_map.txt` is generated under the `IndexProcess.index_dir` directory
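For reference, each line of the generated `id_map.txt` is one `<numeric id> <label>` pair separated by whitespace, which is exactly the two-field format that the new `VectorSearch::LoadIdMap` (shown further below) parses. A minimal, illustrative sample (the label comes from the demo output later on this page; your gallery's ids and labels will differ):
```
0 伊藤园_果蔬汁
1 ...
```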
### 2.3 Debugging with the Phone
### 2.4 Debugging with the Phone
Some preparation work is required first.
1. Prepare an armv8 Android phone. If the compiled prediction library targets armv7, an armv7 phone is required instead, and set `ARM_ABI=arm7` in the Makefile.
......@@ -252,6 +264,7 @@ make ARM_ABI=arm8
```shell
mkdir deploy
mv ppshitu_lite_models_v1.0 deploy/
mv drink_dataset_v1.0 deploy/
mv images deploy/
mv shitu_config.json deploy/
cp pp_shitu deploy/
......@@ -265,12 +278,12 @@ cp ../../../cxx/lib/libpaddle_light_api_shared.so deploy/
```shell
deploy/
|-- ppshitu_lite_models_v1.0/
| |--mainbody_PPLCNet_x2_5_640_v1.0_lite.nb optimized mainbody detection model file
| |--general_PPLCNet_x2_5_quant_v1.0_lite.nb optimized recognition model file
| |--label.txt label file of the recognition model
| |--mainbody_PPLCNet_x2_5_lite_v1.0_infer.nb optimized mainbody detection model file
| |--general_PPLCNet_x2_5_quant_v1.0_lite.nb optimized recognition model file
|-- images/
| |--demo.jpg image file
| ... image files
|-- drink_dataset_v1.0/ bottled drink demo data
| |--index retrieval index directory
|-- pp_shitu generated mobile executable
|-- shitu_config.json runtime parameter configuration file
|-- libpaddle_light_api_shared.so Paddle-Lite library file
......@@ -298,8 +311,10 @@ chmod 777 pp_shitu
If you modify the code, you need to recompile and push it to the phone again.
The running result is as follows:
![](../../docs/images/ppshitu_lite_demo.png)
```
images/demo.jpg:
result0: bbox[253, 275, 1146, 872], score: 0.974196, label: 伊藤园_果蔬汁
```
## FAQ
Q1: What if I want to switch to another model? Do I need to go through the whole process again?
......
......@@ -130,6 +130,8 @@ def main():
y["type"] = k
config_json["RecPreProcess"]["transform_ops"].append(y)
# set IndexProcess
config_json["IndexProcess"] = config_yaml["IndexProcess"]
with open('shitu_config.json', 'w') as fd:
json.dump(config_json, fd, indent=4)
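For orientation, the `IndexProcess` section copied into `shitu_config.json` here is what the new C++ `VectorSearch` class below reads, together with `Global.max_det_results`. A minimal sketch of the relevant keys, with illustrative values that mirror the header's defaults:
```json
{
    "Global": { "max_det_results": 5 },
    "IndexProcess": {
        "index_dir": "drink_dataset_v1.0/index",
        "return_k": 5,
        "score_thres": 0.5
    }
}
```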
......
......@@ -36,10 +36,9 @@ struct RESULT {
float score;
};
class Recognition {
class FeatureExtract {
public:
explicit Recognition(const Json::Value &config_file) {
explicit FeatureExtract(const Json::Value &config_file) {
MobileConfig config;
if (config_file["Global"]["rec_model_path"].as<std::string>().empty()) {
std::cout << "Please set [rec_model_path] in config file" << std::endl;
......@@ -53,29 +52,8 @@ public:
std::cout << "Please set [rec_label_path] in config file" << std::endl;
exit(-1);
}
LoadLabel(config_file["Global"]["rec_label_path"].as<std::string>());
SetPreProcessParam(config_file["RecPreProcess"]["transform_ops"]);
if (!config_file["Global"].isMember("return_k")){
this->topk = config_file["Global"]["return_k"].as<int>();
}
printf("rec model create!\n");
}
void LoadLabel(std::string path) {
std::ifstream file;
std::vector<std::string> label_list;
file.open(path);
while (file) {
std::string line;
std::getline(file, line);
std::string::size_type pos = line.find(" ");
if (pos != std::string::npos) {
line = line.substr(pos);
}
this->label_list.push_back(line);
}
file.clear();
file.close();
printf("feature extract model create!\n");
}
void SetPreProcessParam(const Json::Value &config_file) {
......@@ -97,19 +75,17 @@ public:
}
}
std::vector<RESULT> RunRecModel(const cv::Mat &img, double &cost_time);
std::vector<RESULT> PostProcess(const float *output_data, int output_size,
cv::Mat &output_image);
void RunRecModel(const cv::Mat &img, double &cost_time, std::vector<float> &feature);
//void PostProcess(std::vector<float> &feature);
cv::Mat ResizeImage(const cv::Mat &img);
void NeonMeanScale(const float *din, float *dout, int size);
private:
std::shared_ptr<PaddlePredictor> predictor;
std::vector<std::string> label_list;
//std::vector<std::string> label_list;
std::vector<float> mean = {0.485f, 0.456f, 0.406f};
std::vector<float> std = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
double scale = 0.00392157;
float size = 224;
int topk = 5;
};
} // namespace PPShiTu
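Before the implementation diffs, here is a hedged call-site sketch of the renamed class, mirroring how `main.cc` (further down in this commit) drives it; the config object and input image are assumptions of the sketch:
```cpp
#include <vector>
#include <opencv2/opencv.hpp>
#include "include/feature_extractor.h" // the header above
#include "json/json.h"

// Sketch only: extract one embedding with the renamed FeatureExtract class.
std::vector<float> ExtractOne(const Json::Value &RT_Config, const cv::Mat &crop_img) {
  PPShiTu::FeatureExtract rec(RT_Config); // parses Global / RecPreProcess settings
  std::vector<float> feature;
  double cost_time = 0.0;
  rec.RunRecModel(crop_img, cost_time, feature); // fills 'feature' with the embedding
  // L2 normalization is assumed to happen inside the exported model;
  // see the commented-out PostProcess in feature_extractor.cc below.
  return feature;
}
```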
......@@ -16,7 +16,7 @@
#include <algorithm>
#include <ctime>
#include <include/recognition.h>
#include <include/feature_extractor.h>
#include <memory>
#include <numeric>
#include <string>
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef WIN32
#define OS_PATH_SEP "\\"
#else
#define OS_PATH_SEP "/"
#endif
#include "json/json.h"
#include <cstring>
#include <faiss/Index.h>
#include <faiss/index_io.h>
#include <map>
namespace PPShiTu {
struct SearchResult {
std::vector<faiss::Index::idx_t> I;
std::vector<float> D;
int return_k;
};
class VectorSearch {
public:
explicit VectorSearch(const Json::Value &config) {
// IndexProcess
this->index_dir = config["IndexProcess"]["index_dir"].as<std::string>();
this->return_k = config["IndexProcess"]["return_k"].as<int>();
this->score_thres = config["IndexProcess"]["score_thres"].as<float>();
this->max_query_number = config["Global"]["max_det_results"].as<int>() + 1;
LoadIdMap();
LoadIndexFile();
this->I.resize(this->return_k * this->max_query_number);
this->D.resize(this->return_k * this->max_query_number);
printf("faiss index load success!\n");
};
void LoadIdMap();
void LoadIndexFile();
const SearchResult &Search(float *feature, int query_number);
const std::string &GetLabel(faiss::Index::idx_t ind);
const float &GetThreshold() { return this->score_thres; }
private:
std::string index_dir;
int return_k = 5;
float score_thres = 0.5;
std::map<long int, std::string> id_map;
faiss::Index *index;
int max_query_number = 6;
std::vector<float> D;
std::vector<faiss::Index::idx_t> I;
SearchResult sr;
};
} // namespace PPShiTu
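For context, a hedged sketch of how this class is consumed, matching the `main.cc` changes later in this commit; `config` is the parsed `shitu_config.json` and `features` is a flat buffer holding `query_number` embeddings back to back:
```cpp
#include <cstdio>
#include <vector>
#include "include/vector_search.h" // the header above
#include "json/json.h"

// Sketch: batched search, then read out the top-1 hit of each query.
void TopOnePerQuery(const Json::Value &config, std::vector<float> &features,
                    int query_number) {
  PPShiTu::VectorSearch searcher(config); // loads vector.index and id_map.txt
  const PPShiTu::SearchResult &res = searcher.Search(features.data(), query_number);
  for (int q = 0; q < query_number; ++q) {
    // I (ids) and D (scores) are laid out row-major: return_k entries per query.
    faiss::Index::idx_t id = res.I[res.return_k * q];
    float score = res.D[res.return_k * q];
    if (score >= searcher.GetThreshold()) // optional score_thres filtering
      printf("query %d: %s (score %.3f)\n", q, searcher.GetLabel(id).c_str(), score);
  }
}
```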
......@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/recognition.h"
#include "include/feature_extractor.h"
namespace PPShiTu {
std::vector<RESULT> Recognition::RunRecModel(const cv::Mat &img,
double &cost_time) {
void FeatureExtract::RunRecModel(const cv::Mat &img,
double &cost_time,
std::vector<float> &feature) {
// Read img
cv::Mat resize_image = ResizeImage(img);
......@@ -38,8 +38,7 @@ std::vector<RESULT> Recognition::RunRecModel(const cv::Mat &img,
// Get output and post process
std::unique_ptr<const Tensor> output_tensor(
std::move(this->predictor->GetOutput(1)));
auto *output_data = output_tensor->data<float>();
std::move(this->predictor->GetOutput(0))); // only one output
auto end = std::chrono::system_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
......@@ -47,17 +46,27 @@ std::vector<RESULT> Recognition::RunRecModel(const cv::Mat &img,
std::chrono::microseconds::period::num /
std::chrono::microseconds::period::den;
// do postprocess
int output_size = 1;
for (auto dim : output_tensor->shape()) {
output_size *= dim;
}
feature.resize(output_size);
output_tensor->CopyToCpu(feature.data());
cv::Mat output_image;
auto results = PostProcess(output_data, output_size, output_image);
return results;
// postprocess includes sqrt or binarize.
// PostProcess(feature);
return;
}
void Recognition::NeonMeanScale(const float *din, float *dout, int size) {
// void FeatureExtract::PostProcess(std::vector<float> &feature){
// float feature_sqrt = std::sqrt(std::inner_product(
// feature.begin(), feature.end(), feature.begin(), 0.0f));
// for (int i = 0; i < feature.size(); ++i)
// feature[i] /= feature_sqrt;
// }
void FeatureExtract::NeonMeanScale(const float *din, float *dout, int size) {
if (this->mean.size() != 3 || this->std.size() != 3) {
std::cerr << "[ERROR] mean or scale size must equal to 3\n";
......@@ -99,45 +108,9 @@ void Recognition::NeonMeanScale(const float *din, float *dout, int size) {
}
}
cv::Mat Recognition::ResizeImage(const cv::Mat &img) {
cv::Mat FeatureExtract::ResizeImage(const cv::Mat &img) {
cv::Mat resize_img;
cv::resize(img, resize_img, cv::Size(this->size, this->size));
return resize_img;
}
std::vector<RESULT> Recognition::PostProcess(const float *output_data,
int output_size,
cv::Mat &output_image) {
int max_indices[this->topk];
double max_scores[this->topk];
for (int i = 0; i < this->topk; i++) {
max_indices[i] = 0;
max_scores[i] = 0;
}
for (int i = 0; i < output_size; i++) {
float score = output_data[i];
int index = i;
for (int j = 0; j < this->topk; j++) {
if (score > max_scores[j]) {
index += max_indices[j];
max_indices[j] = index - max_indices[j];
index -= max_indices[j];
score += max_scores[j];
max_scores[j] = score - max_scores[j];
score -= max_scores[j];
}
}
}
std::vector<RESULT> results(this->topk);
for (int i = 0; i < results.size(); i++) {
results[i].class_name = "Unknown";
if (max_indices[i] >= 0 && max_indices[i] < this->label_list.size()) {
results[i].class_name = this->label_list[max_indices[i]];
}
results[i].score = max_scores[i];
results[i].class_id = max_indices[i];
}
return results;
}
}
......@@ -24,9 +24,10 @@
#include <vector>
#include "include/config_parser.h"
#include "include/feature_extractor.h"
#include "include/object_detector.h"
#include "include/preprocess_op.h"
#include "include/recognition.h"
#include "include/vector_search.h"
#include "json/json.h"
Json::Value RT_Config;
......@@ -111,14 +112,18 @@ void DetPredictImage(const std::vector<cv::Mat> &batch_imgs,
}
}
void PrintResult(const std::string &image_path,
std::vector<PPShiTu::ObjectResult> &det_result) {
printf("%s:\n", image_path.c_str());
void PrintResult(std::string &img_path,
std::vector<PPShiTu::ObjectResult> &det_result,
PPShiTu::VectorSearch &vector_search,
PPShiTu::SearchResult &search_result) {
printf("%s:\n", img_path.c_str());
for (int i = 0; i < det_result.size(); ++i) {
int t = i;
printf("\tresult%d: bbox[%d, %d, %d, %d], score: %f, label: %s\n", i,
det_result[i].rect[0], det_result[i].rect[1], det_result[i].rect[2],
det_result[i].rect[3], det_result[i].rec_result[0].score,
det_result[i].rec_result[0].class_name.c_str());
det_result[t].rect[0], det_result[t].rect[1], det_result[t].rect[2],
det_result[t].rect[3], det_result[t].confidence,
vector_search.GetLabel(search_result.I[search_result.return_k * t])
.c_str());
}
}
......@@ -159,11 +164,16 @@ int main(int argc, char **argv) {
RT_Config["Global"]["cpu_num_threads"].as<int>(),
RT_Config["Global"]["batch_size"].as<int>());
// create rec model
PPShiTu::Recognition rec(RT_Config);
PPShiTu::FeatureExtract rec(RT_Config);
PPShiTu::VectorSearch searcher(RT_Config);
// Do inference on input image
std::vector<PPShiTu::ObjectResult> det_result;
std::vector<cv::Mat> batch_imgs;
// for vector search
std::vector<float> feature;
std::vector<float> features;
double rec_time;
if (!RT_Config["Global"]["infer_imgs"].as<std::string>().empty() ||
!img_dir.empty()) {
......@@ -178,8 +188,7 @@ int main(int argc, char **argv) {
return -1;
}
} else {
cv::glob(img_dir,
cv_all_img_paths);
cv::glob(img_dir, cv_all_img_paths);
for (const auto &img_path : cv_all_img_paths) {
all_img_paths.push_back(img_path);
}
......@@ -199,24 +208,25 @@ int main(int argc, char **argv) {
RT_Config["Global"]["max_det_results"].as<int>(), false, &det);
// add the whole image for recognition to improve recall
PPShiTu::ObjectResult result_whole_img = {
{0, 0, srcimg.cols, srcimg.rows}, 0, 1.0};
det_result.push_back(result_whole_img);
// PPShiTu::ObjectResult result_whole_img = {
// {0, 0, srcimg.cols, srcimg.rows}, 0, 1.0};
// det_result.push_back(result_whole_img);
// get rec result
PPShiTu::SearchResult search_result;
for (int j = 0; j < det_result.size(); ++j) {
int w = det_result[j].rect[2] - det_result[j].rect[0];
int h = det_result[j].rect[3] - det_result[j].rect[1];
cv::Rect rect(det_result[j].rect[0], det_result[j].rect[1], w, h);
cv::Mat crop_img = srcimg(rect);
std::vector<PPShiTu::RESULT> result =
rec.RunRecModel(crop_img, rec_time);
det_result[j].rec_result.assign(result.begin(), result.end());
rec.RunRecModel(crop_img, rec_time, feature);
features.insert(features.end(), feature.begin(), feature.end());
}
// rec nms
PPShiTu::nms(det_result,
RT_Config["Global"]["rec_nms_thresold"].as<float>(), true);
PrintResult(img_path, det_result);
// do vector search
search_result = searcher.Search(features.data(), det_result.size());
PrintResult(img_path, det_result, searcher, search_result);
batch_imgs.clear();
det_result.clear();
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/vector_search.h"
#include <cstdio>
#include <faiss/index_io.h>
#include <fstream>
#include <iostream>
#include <regex>
namespace PPShiTu {
// load the vector.index
void VectorSearch::LoadIndexFile() {
std::string file_path = this->index_dir + OS_PATH_SEP + "vector.index";
const char *fname = file_path.c_str();
this->index = faiss::read_index(fname, 0);
}
// load id_map.txt
void VectorSearch::LoadIdMap() {
std::string file_path = this->index_dir + OS_PATH_SEP + "id_map.txt";
std::ifstream in(file_path);
std::string line;
std::vector<std::string> m_vec;
if (in) {
while (getline(in, line)) {
std::regex ws_re("\\s+");
std::vector<std::string> v(
std::sregex_token_iterator(line.begin(), line.end(), ws_re, -1),
std::sregex_token_iterator());
if (v.size() != 2) {
std::cout << "The number of element for each line in : " << file_path
<< "must be 2, exit the program..." << std::endl;
exit(1);
} else
this->id_map.insert(std::pair<long int, std::string>(
std::stol(v[0], nullptr, 10), v[1]));
}
}
}
// doing search
const SearchResult &VectorSearch::Search(float *feature, int query_number) {
this->D.resize(this->return_k * query_number);
this->I.resize(this->return_k * query_number);
this->index->search(query_number, feature, return_k, D.data(), I.data());
this->sr.return_k = this->return_k;
this->sr.D = this->D;
this->sr.I = this->I;
return this->sr;
}
const std::string &VectorSearch::GetLabel(faiss::Index::idx_t ind) {
return this->id_map.at(ind);
}
}
\ No newline at end of file
import argparse
import os
import pickle
import yaml
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, required=True)
args = parser.parse_args()
return args
def main():
args = parse_args()
with open(args.config) as fd:
config = yaml.load(fd.read(), yaml.FullLoader)
index_dir = ""
try:
index_dir = config["IndexProcess"]["index_dir"]
except Exception as e:
print("The IndexProcess.index_dir in config_file dose not exist")
exit(1)
id_map_path = os.path.join(index_dir, "id_map.pkl")
assert os.path.exists(
id_map_path), "The id_map file dose not exist: {}".format(id_map_path)
with open(id_map_path, "rb") as fd:
ids = pickle.load(fd)
with open(os.path.join(index_dir, "id_map.txt"), "w") as fd:
for k, v in ids.items():
v = v.split("\t")[1]
fd.write(str(k) + " " + v + "\n")
print('Transform id_map success')
if __name__ == "__main__":
main()
......@@ -55,7 +55,7 @@ After the data is settled, the model often determines the upper limit of the fin
<a name="2.3"></a>
### 2.3 Train the Model
After preparing the data and model, you can start training the model and updating the parameters of the model. After many iterations, a trained model can finally be obtained for image classification tasks. The training process of image classification requires a lot of experience and involves the setting of many hyperparameters. PaddleClas provides a series of [training tuning methods](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/en/models/Tricks_en.md), which can help you quickly obtain a high-precision model.
After preparing the data and model, you can start training the model and updating the parameters of the model. After many iterations, a trained model can finally be obtained for image classification tasks. The training process of image classification requires a lot of experience and involves the setting of many hyperparameters. PaddleClas provides a series of [training tuning methods](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/en/models_training/train_strategy_en.md), which can help you quickly obtain a high-precision model.
<a name="2.4"></a>
......
......@@ -108,7 +108,7 @@ PaddleClas strictly follows the resolution used by the authors of the paper. Sin
**A**:
There are many ssld pre-training models available in PaddleClas, which obtain better pre-training weights by semi-supervised knowledge distillation, so that the accuracy can be improved by replacing the ssld pre-training models with higher accuracy in transfer tasks or downstream vision tasks without replacing the structure files. For example, in PaddleSeg, [HRNet](../models/HRNet_en.md), with the weight of the ssld pre-training model, achieves much better accuracy than comparable models in the industry; in PaddleDetection, [PP-YOLO](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/configs/ppyolo/README_cn.md) with ssld pre-training weights improves further on an already high baseline. Transferring classification with ssld pre-training weights also yields impressive results; the benefits of knowledge distillation for transferring classification tasks are detailed in [SSLD Distillation Strategy](../advanced_tutorials/knowledge_distillation_en.md)
There are many ssld pre-training models available in PaddleClas, which obtain better pre-training weights by semi-supervised knowledge distillation, so that the accuracy can be improved by replacing the ssld pre-training models with higher accuracy in transfer tasks or downstream vision tasks without replacing the structure files. For example, in PaddleSeg, [HRNet](../models/HRNet_en.md), with the weight of the ssld pre-training model, achieves much better accuracy than comparable models in the industry; in PaddleDetection, [PP-YOLO](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/configs/ppyolo/README_cn.md) with ssld pre-training weights improves further on an already high baseline. Transferring classification with ssld pre-training weights also yields impressive results; the benefits of knowledge distillation for transferring classification tasks are detailed in [SSLD Distillation Strategy](../advanced_tutorials/distillation/distillation_en.md)
<a name="3"></a>
......@@ -143,7 +143,7 @@ When adopting multiple models for inference, it is recommended to first export t
**A**
- You can adopt auto-mixed precision training, which can gain a significantly faster speed with almost zero precision loss. Take ResNet50 as an example, the configuration file of auto-mixed precision training in PaddleClas can be found at: [ResNet50_fp16.yml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml). The main step is to add the following lines to the standard configuration file.
- You can adopt auto-mixed precision training, which can gain a significantly faster speed with almost zero precision loss. Take ResNet50 as an example, the configuration file of auto-mixed precision training in PaddleClas can be found at: [ResNet50_amp_O1.yml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml). The main step is to add the following lines to the standard configuration file.
```
# mixed precision training
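# (the rest of this block is collapsed in the diff; for reference, the AMP
#  section used elsewhere in this commit looks like the following)
AMP:
  scale_loss: 128.0
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1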
......@@ -351,7 +351,7 @@ At this stage, it has become a common practice in the image recognition field to
**A**: If the existing strategy cannot further improve the accuracy of the model, it means that the model has almost reached saturation with the existing dataset and strategy, and two methods are provided here.
- Mining relevant data: Use the model trained on the existing dataset to make predictions on the relevant data, label the data with higher confidence and add it to the training set for further training. Repeat the steps above to further improve the accuracy of the model.
- Knowledge distillation: You can use a larger model to train a teacher model with higher accuracy on the dataset, and then adopt the teacher model to teach a Student model, where the Student model is the target model. PaddleClas provides Baidu's own SSLD knowledge distillation scheme, which can steadily improve by more than 3% even on such a challenging classification task as ImageNet-1k. For the chapter on SSLD knowledge distillation, please refer to [**SSLD Knowledge Distillation**](../advanced_tutorials/knowledge_distillation_en.md).
- Knowledge distillation: You can use a larger model to train a teacher model with higher accuracy on the dataset, and then adopt the teacher model to teach a Student model, where the Student model is the target model. PaddleClas provides Baidu's own SSLD knowledge distillation scheme, which can steadily improve by more than 3% even on such a challenging classification task as ImageNet-1k. For the chapter on SSLD knowledge distillation, please refer to [**SSLD Knowledge Distillation**](../advanced_tutorials/distillation/distillation_en.md).
<a name="6"></a>
......
......@@ -248,7 +248,7 @@ PaddleClas saves/updates the following three types of models during training.
#### Q2.4.2: How can recognition models be fine-tuned to train on the basis of pre-trained models?
**A**: The fine-tuning training of the recognition model is similar to that of the classification model. The recognition model can be loaded with a pre-trained model of the product, and the training process can be found in [recognition model training](../../models_training/recognition_en.md), and we will continue to refine the documentation.
**A**: The fine-tuning training of the recognition model is similar to that of the classification model. The recognition model can be loaded with a pre-trained model of the product, and the training process can be found in [recognition model training](../models_training/recognition_en.md), and we will continue to refine the documentation.
#### Q2.4.3: Why does it fail to run all mini-batches in each epoch when training metric learning?
......@@ -353,4 +353,4 @@ pip install paddle2onnx
- `InputSpec()` function is used to describe the signature information of the model input, including the `shape`, `type` and `name` of the input data (can be omitted).
- The `paddle.onnx.export()` function needs to specify the model grouping object `net`, the save path of the exported model `save_path`, and the description of the model's input data `input_spec`.
Note that `paddlepaddle` `2.0.0` or above should be adopted. See [paddle.onnx.export](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/onnx/) for more details on the parameters of the `paddle.onnx.export()` function.
Note that `paddlepaddle` `2.0.0` or above should be adopted. See [paddle.onnx.export](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/onnx/export_en.html) for more details on the parameters of the `paddle.onnx.export()` function.
......@@ -27,8 +27,8 @@
> >
- Q: 怎样根据自己的任务选择合适的模型进行训练?How to choose the right training model?
- A: If you want to deploy on the server with a high requirement for accuracy but not model storage size or prediction speed, then it is recommended to use ResNet_vd, Res2Net_vd, DenseNet, Xception, etc., which are suitable for server-side models. If you want to deploy on the mobile side, then it is recommended to use MobileNetV3 and GhostNet. Meanwhile, we suggest you refer to the speed-accuracy metrics chart in [Model Library](../models/models_intro_en.md) when choosing models.
- Q: How to choose the right training model?
- A: If you want to deploy on the server with a high requirement for accuracy but not model storage size or prediction speed, then it is recommended to use ResNet_vd, Res2Net_vd, DenseNet, Xception, etc., which are suitable for server-side models. If you want to deploy on the mobile side, then it is recommended to use MobileNetV3 and GhostNet. Meanwhile, we suggest you refer to the speed-accuracy metrics chart in [Model Library](../algorithm_introduction/ImageNet_models_en.md) when choosing models.
> >
......@@ -280,7 +280,7 @@ Loss:
> >
- Q: How to train with Automatic Mixed Precision (AMP) during training?
- A: You can refer to [ResNet50_fp16.yaml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml). Specifically, if you want your configuration file to support automatic mixed precision during model training, you can add the following information to the file.
- A: You can refer to [ResNet50_amp_O1.yaml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml). Specifically, if you want your configuration file to support automatic mixed precision during model training, you can add the following information to the file.
```
# mixed precision training
......
......@@ -48,7 +48,7 @@ Before installing the service module, you need to prepare the inference model an
**Notice**:
* The model file path can be viewed and modified in `PaddleClas/deploy/hubserving/clas/params.py`.
* It should be noted that the prefix of model structure file and model parameters file must be `inference`.
* More models provided by PaddleClas can be obtained from the [model library](../models/models_intro_en.md). You can also use models trained by yourself.
* More models provided by PaddleClas can be obtained from the [model library](../algorithm_introduction/ImageNet_models_en.md). You can also use models trained by yourself.
<a name="4"></a>
## 4. Install Service Module
......
......@@ -4,7 +4,7 @@ This tutorial will introduce how to use [Paddle-Lite](https://github.com/PaddleP
Paddle-Lite is a lightweight inference engine for PaddlePaddle. It provides efficient inference capabilities for mobile phones and IoT devices, and extensively integrates cross-platform hardware to provide lightweight deployment solutions for mobile-side deployment issues.
If you only want to test speed, please refer to [The tutorial of Paddle-Lite mobile-side benchmark test](../extension/paddle_mobile_inference_en.md).
If you only want to test speed, please refer to [The tutorial of Paddle-Lite mobile-side benchmark test](../others/paddle_mobile_inference_en.md).
---
......@@ -47,7 +47,7 @@ For the detailed compilation directions of different development environments, p
1. If you download the inference library from [Paddle-Lite official document](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc), please choose `with_extra=ON` , `with_cv=ON` .
2. It is recommended to build the inference library from the [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) develop branch if you want to deploy a [quantized](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/slim/quantization/README_en.md) model to mobile phones. Please refer to the [link](https://paddle-lite.readthedocs.io/zh/latest/user_guides/Compile/Android.html#id2) for more detailed information about compiling.
2. It is recommended to build the inference library from the [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) develop branch if you want to deploy a [quantized](https://github.com/PaddlePaddle/PaddleOCR/blob/develop/deploy/slim/quantization/README_en.md) model to mobile phones. Please refer to the [link](https://paddle-lite.readthedocs.io/) for more detailed information about compiling.
The structure of the inference library is as follows:
......
......@@ -107,7 +107,7 @@ For image classification, ImageNet dataset is adopted. Compared with the current
| PPLCNet_x1_0_ssld | 3.0 | 161 | 74.39 | 92.09 | 2.46 |
| PPLCNet_x2_5_ssld | 9.0 | 906 | 80.82 | 95.33 | 5.39 |
where `_ssld` represents the model after using `SSLD distillation`. For details about `SSLD distillation`, see [SSLD distillation](../advanced_tutorials/knowledge_distillation_en.md).
where `_ssld` represents the model after using `SSLD distillation`. For details about `SSLD distillation`, see [SSLD distillation](../advanced_tutorials/distillation/distillation_en.md).
Performance comparison with other lightweight networks:
......@@ -190,7 +190,7 @@ Rather than holding on to perfect FLOPs and Params as academics do, PP-LCNet foc
Reference to cite when you use PP-LCNet in a paper:
```
@misc{cui2021pplcnet,
title={PP-LCNet: A Lightweight CPU Convolutional Neural Network},
title={PP-LCNet: A Lightweight CPU Convolutional Neural Network},
author={Cheng Cui and Tingquan Gao and Shengyu Wei and Yuning Du and Ruoyu Guo and Shuilong Dong and Bin Lu and Ying Zhou and Xueying Lv and Qiwen Liu and Xiaoguang Hu and Dianhai Yu and Yanjun Ma},
year={2021},
eprint={2109.15099},
......
......@@ -217,7 +217,7 @@ Some of the configurable evaluation parameters are described as follows:
**Note:** When loading the model to be evaluated, you only need to specify the path of the model file instead of the suffix. PaddleClas will automatically add the `.pdparams` suffix, such as [3.1.3 Resume Training](#3.1.3).
When loading the model to be evaluated, you only need to specify the path of the model file instead of the suffix. PaddleClas will automatically add the `.pdparams` suffix, such as [3.1.3 Resume Training](https://github.com/PaddlePaddle/PaddleClas/blob/ develop/docs/zh_CN/models_training/classification.md#3.1.3).
When loading the model to be evaluated, you only need to specify the path of the model file instead of the suffix. PaddleClas will automatically add the `.pdparams` suffix, such as [3.1.3 Resume Training](../models_training/classification_en.md#3.1.3).
<a name="3.2"></a>
......
......@@ -27,7 +27,7 @@ The first step is to select the model to be studied, here we choose ResNet50. Co
wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_pretrained.pdparams
```
For other pre-trained models and the corresponding network structure code, please refer to the [model library](../../../ppcls/arch/backbone/) and download the [pre-trained models](../models/models_intro_en.md).
For other pre-trained models and the corresponding network structure code, please refer to the [model library](../../../ppcls/arch/backbone/) and download the [pre-trained models](../algorithm_introduction/ImageNet_models_en.md).
<a name='3'></a>
......
# Quick Start of Multi-label Classification
Experience the training, evaluation, and prediction of multi-label classification based on the [NUS-WIDE-SCENE](https://lms.comp.nus.edu.sg/wp-content/uploads/2019/research/nuswide/NUS-WIDE.html) dataset, which is a subset of the NUS-WIDE dataset. Please first install PaddlePaddle and PaddleClas, see [Paddle Installation](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/installation) and [PaddleClas installation](https://github.com/PaddlePaddle/PaddleClas/blob/develop/docs/zh_CN/installation/install_ paddleclas.md) for more details.
Experience the training, evaluation, and prediction of multi-label classification based on the [NUS-WIDE-SCENE](https://lms.comp.nus.edu.sg/wp-content/uploads/2019/research/nuswide/NUS-WIDE.html) dataset, which is a subset of the NUS-WIDE dataset. Please first install PaddlePaddle and PaddleClas, see [Paddle Installation](../installation/install_paddle_en.md) and [PaddleClas installation](../installation/install_paddleclas_en.md) for more details.
## Catalogue
......
......@@ -80,7 +80,7 @@
Since the model needs to be trained, you should collect your own dataset. For data preparation and the corresponding format, please refer to the `4.1 Data Preparation` section of the [feature extraction document](../image_recognition_pipeline/feature_extraction.md) and the [recognition dataset description](../data_preparation/recognition_dataset.md). Note that a large amount of data is needed here to guarantee the recognition model's performance. For the training configuration file, refer to the [general recognition model configuration file](../../../ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml); for the training method, refer to [recognition model training](../models_training/recognition.md)
- Data augmentation: choose augmentation methods according to the actual situation, e.g. if occlusion is severe in practice, it is recommended to add the `RandomErasing` augmentation. See the [data augmentation document](./DataAugmentation.md) for details
- Switch to a different `backbone`; generally, a larger model has a stronger feature extraction capability. See the [model introduction](../models/models_intro.md) for the available `backbone`s
- Switch to a different `backbone`; generally, a larger model has a stronger feature extraction capability. See the [model introduction](../algorithm_introduction/ImageNet_models.md) for the available `backbone`s
- Choose a different `Metric Learning` method; different `Metric Learning` methods may behave differently on different datasets, so it is recommended to try other `Loss` functions. See [Metric Learning](../algorithm_introduction/metric_learning.md) for details
- Use distillation to boost the capability of small models. See [model distillation](../algorithm_introduction/knowledge_distillation.md) for details
- Supplement the dataset: add badcase data for the misclassified samples
......
......@@ -122,14 +122,15 @@ ResNet 系列模型中,相比于其他模型,ResNet_vd 模型在预测速度
**A**
* You can train with automatic mixed precision, which brings a noticeable speed gain at almost no loss of precision. Taking ResNet50 as an example, the PaddleClas configuration file for automatic mixed precision training can be found at [ResNet50_fp16.yml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml); essentially, you only need to add the following lines to the standard configuration file
* You can train with automatic mixed precision, which brings a noticeable speed gain at almost no loss of precision. Taking ResNet50 as an example, the PaddleClas configuration file for automatic mixed precision training can be found at [ResNet50_amp_O1.yml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml); essentially, you only need to add the following lines to the standard configuration file
```
```yaml
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
use_pure_fp16: &use_pure_fp16 True
# O1: mixed fp16
level: O1
```
* You can enable DALI to run data preprocessing on the GPU. When the model is relatively small (the reader takes a higher share of the time), enabling DALI brings a noticeable training speed gain; add `-o Global.use_dali=True` when training to use DALI. For more on installing and using DALI, refer to the [DALI installation tutorial](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html#nightly-builds)
......
......@@ -31,7 +31,8 @@
>>
* Q: How to choose a suitable model to train according to your own task?
* A: If you want to deploy on a server, or want accuracy as high as possible without strict requirements on model storage size or prediction speed, server-side series such as ResNet_vd, Res2Net_vd, DenseNet and Xception are recommended; for mobile-side deployment, series such as MobileNetV3 and GhostNet are recommended. Meanwhile, we recommend consulting the speed-accuracy chart in the [model library](../models/models_intro.md) when choosing a model.
* A: If you want to deploy on a server, or want accuracy as high as possible without strict requirements on model storage size or prediction speed, server-side series such as ResNet_vd, Res2Net_vd, DenseNet and Xception are recommended; for mobile-side deployment, series such as MobileNetV3 and GhostNet are recommended. Meanwhile, we recommend consulting the speed-accuracy chart in the [model library](../algorithm_introduction/ImageNet_models.md) when choosing a model.
>>
* Q: How to initialize parameters? What kind of initialization can speed up model convergence?
......@@ -232,11 +233,13 @@ Loss:
* A: If you want to use TensorRT for model inference, you need to install or compile a TensorRT-enabled PaddlePaddle. Users on Linux, Windows and macOS can refer to [Download inference libraries](https://paddleinference.paddlepaddle.org.cn/user_guides/download_lib.html); if no suitable version exists, compile and install it locally by following [Compile from source](https://paddleinference.paddlepaddle.org.cn/user_guides/source_compile.html)
>>
* Q: How to train with Automatic Mixed Precision (AMP)?
* A: You can refer to the [ResNet50_fp16.yaml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml) configuration file; specifically, if you want your own configuration file to support automatic mixed precision during model training, add the following configuration.
```
* A: You can refer to the [ResNet50_amp_O1.yaml](../../../ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml) configuration file; specifically, if you want your own configuration file to support automatic mixed precision during model training, add the following configuration.
```yaml
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
use_pure_fp16: &use_pure_fp16 True
# O1: mixed fp16
level: O1
```
......@@ -15,8 +15,8 @@ PaddleClas 支持通过 PaddleHub 快速进行服务化部署。目前支持图
- [5.2 配置文件启动](#5.2)
- [6. 发送预测请求](#6)
- [7. 自定义修改服务模块](#7)
<a name="1"></a>
## 1. 简介
......@@ -55,7 +55,7 @@ pip3 install paddlehub==2.1.0 --upgrade -i https://pypi.tuna.tsinghua.edu.cn/sim
```
Note that:
* The model files (including `.pdmodel` and `.pdiparams`) must be named `inference`.
* We also provide a large number of pre-trained models based on the ImageNet-1k dataset; for the model list and download addresses, see the [model library overview](../models/models_intro.md). You can also use models you trained and converted yourself.
* We also provide a large number of pre-trained models based on the ImageNet-1k dataset; for the model list and download addresses, see the [model library overview](../algorithm_introduction/ImageNet_models.md). You can also use models you trained and converted yourself.
<a name="4"></a>
......
......@@ -23,7 +23,7 @@
wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_pretrained.pdparams
```
For the network structure code and pre-trained models of other models, please download them yourself: [model library](../../../ppcls/arch/backbone/), [pre-trained models](../models/models_intro.md)
For the network structure code and pre-trained models of other models, please download them yourself: [model library](../../../ppcls/arch/backbone/), [pre-trained models](../algorithm_introduction/ImageNet_models.md)
<a name='3'></a>
......
......@@ -3,12 +3,8 @@
- 2021.11.1 Released the [PP-ShiTu technical report](https://arxiv.org/pdf/2111.00775.pdf) and added the drink recognition demo.
- 2021.10.23 Released PP-ShiTu, a lightweight image recognition system that finishes recognition against a 100k+ gallery within 0.2s on CPU. [Click here](../quick_start/quick_start_recognition.md) to try it now.
- 2021.09.17 Released the PP-LCNet series of ultra-lightweight backbone models: about 5ms per image on Intel CPU, with 80.82% Top-1 accuracy on the ImageNet-1K dataset, surpassing ResNet152. For an introduction to PP-LCNet, see the [paper](https://arxiv.org/pdf/2109.15099.pdf) or the [PP-LCNet model introduction](../models/PP-LCNet.md); the related metrics and pre-trained weights can be downloaded from [here](../algorithm_introduction/ImageNet_models.md).
- 2021.08.11 Updated 7 [FAQs](../faq_series/faq_2021_s2.md).
- 2021.06.29 Added the Swin-Transformer series models, with Top-1 accuracy up to 87.2% on the ImageNet-1k dataset; training, prediction and evaluation are supported, as well as whl package deployment. Pre-trained models can be downloaded from [here](../models/models_intro.md).
- 2021.06.22,23,24 The PaddleClas R&D team gave a three-day live course with in-depth technical interpretation. Course replay: [https://aistudio.baidu.com/aistudio/course/introduce/24519](https://aistudio.baidu.com/aistudio/course/introduce/24519)
- 2021.06.16 PaddleClas v2.2 upgrade: integrated metric learning, vector search and other components; added 4 image recognition applications, covering product, cartoon character, vehicle and logo recognition; added 30 pre-trained models across the LeViT, Twins, TNT, DLA, HarDNet and RedNet series.
- 2021.08.11 Updated 7 [FAQs](../faq_series/faq_2021_s2.md).
- 2021.06.29 Added the Swin-Transformer series models, with Top-1 accuracy up to 87.2% on the ImageNet-1k dataset; training, prediction and evaluation are supported, as well as whl package deployment. Pre-trained models can be downloaded from [here](../models/models_intro.md).
- 2021.06.29 Added the Swin-Transformer series models, with Top-1 accuracy up to 87.2% on the ImageNet-1k dataset; training, prediction and evaluation are supported, as well as whl package deployment. Pre-trained models can be downloaded from [here](../algorithm_introduction/ImageNet_models.md).
- 2021.06.22,23,24 The PaddleClas R&D team gave a three-day live course with in-depth technical interpretation. Course replay: [https://aistudio.baidu.com/aistudio/course/introduce/24519](https://aistudio.baidu.com/aistudio/course/introduce/24519)
- 2021.06.16 PaddleClas v2.2 upgrade: integrated metric learning, vector search and other components; added 4 image recognition applications, covering product, cartoon character, vehicle and logo recognition; added 30 pre-trained models across the LeViT, Twins, TNT, DLA, HarDNet and RedNet series.
- 2021.04.15
......
......@@ -10,9 +10,8 @@ Global:
epochs: 120
print_batch_step: 10
use_visualdl: False
image_channel: &image_channel 4
# used for static mode and model export
image_shape: [*image_channel, 224, 224]
image_shape: [3, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
......@@ -29,8 +28,6 @@ AMP:
Arch:
name: ResNet50
class_num: 1000
input_image_channel: *image_channel
data_format: "NHWC"
# loss function config for training/eval process
Loss:
......@@ -76,7 +73,6 @@ DataLoader:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
......@@ -105,7 +101,6 @@ DataLoader:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
......@@ -131,7 +126,6 @@ Infer:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
- ToCHWImage:
PostProcess:
name: Topk
......
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: ./output/
device: gpu
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
image_channel: &image_channel 4
# used for static mode and model export
image_shape: [*image_channel, 224, 224]
save_inference_dir: ./inference
# training model under @to_static
to_static: False
use_dali: True
# mixed precision training
AMP:
scale_loss: 128.0
use_dynamic_loss_scaling: True
# O1: mixed fp16
level: O1
# model architecture
Arch:
name: ResNet50
class_num: 1000
input_image_channel: *image_channel
data_format: "NHWC"
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
multi_precision: True
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
# data loader for train and eval
DataLoader:
Train:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/train_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: True
loader:
num_workers: 4
use_shared_memory: True
Eval:
dataset:
name: ImageNetDataset
image_root: ./dataset/ILSVRC2012/
cls_label_path: ./dataset/ILSVRC2012/val_list.txt
transform_ops:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
sampler:
name: DistributedBatchSampler
batch_size: 64
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Infer:
infer_imgs: docs/images/inference_deployment/whl_demo.jpg
batch_size: 10
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
channel_num: *image_channel
- ToCHWImage:
PostProcess:
name: Topk
topk: 5
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
Metric:
Train:
- TopkAcc:
topk: [1, 5]
Eval:
- TopkAcc:
topk: [1, 5]
......@@ -171,7 +171,9 @@ class Engine(object):
if metric_config is not None:
metric_config = metric_config.get("Train")
if metric_config is not None:
if hasattr(self.train_dataloader, "collate_fn"):
if hasattr(
self.train_dataloader, "collate_fn"
) and self.train_dataloader.collate_fn is not None:
for m_idx, m in enumerate(metric_config):
if "TopkAcc" in m:
msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
......
......@@ -65,7 +65,22 @@ FILENAME=$new_filename
# MODE must be one of ['benchmark_train']
MODE=$2
PARAMS=$3
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1
REST_ARGS=$4
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train to_static
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 to_static
to_static="d2sF"
# parse "to_static" options and modify trainer into "to_static_trainer"
if [ "$REST_ARGS" = "to_static" ] || [ "$PARAMS" = "to_static" ]; then
to_static="d2sT"
sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME
# clear PARAM contents
if [ "$PARAMS" = "to_static" ]; then
PARAMS=""
fi
fi
IFS=$'\n'
# parser params from train_benchmark.txt
sed -i 's/ -o DataLoader.Train.sampler.shuffle=False//g' $FILENAME
......@@ -142,7 +157,6 @@ else
batch_size=${params_list[1]}
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
precision=${params_list[2]}
# run_process_type=${params_list[3]}
run_mode=${params_list[3]}
device_num=${params_list[4]}
IFS=";"
......@@ -167,10 +181,9 @@ for batch_size in ${batch_size_list[*]}; do
gpu_id=$(set_gpu_id $device_num)
if [ ${#gpu_id} -le 1 ];then
run_process_type="SingleP"
log_path="$SAVE_LOG/profiling_log"
mkdir -p $log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}_profiling"
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
# set profile_option params
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
......@@ -186,8 +199,8 @@ for batch_size in ${batch_size_list[*]}; do
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}_speed"
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
echo $cmd
......@@ -198,13 +211,12 @@ for batch_size in ${batch_size_list[*]}; do
eval "cat ${log_path}/${log_name}"
# parser log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--run_process_type ${run_process_type} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
......@@ -218,13 +230,12 @@ for batch_size in ${batch_size_list[*]}; do
else
IFS=";"
unset_env=`unset CUDA_VISIBLE_DEVICES`
run_process_type="MultiP"
log_path="$SAVE_LOG/train_log"
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}_log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}_speed"
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
......@@ -235,14 +246,13 @@ for batch_size in ${batch_size_list[*]}; do
export model_run_time=$((${job_et}-${job_bt}))
eval "cat ${log_path}/${log_name}"
# parser log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--run_process_type ${run_process_type} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
......
......@@ -17,7 +17,7 @@ norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
pact_train:null
fpgm_train:null
distill_train:null
null:null
to_static_train:-o Global.to_static=True
null:null
##
===========================eval_params===========================
......
......@@ -17,7 +17,7 @@ norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
pact_train:null
fpgm_train:null
distill_train:null
null:null
to_static_train:-o Global.to_static=True
null:null
##
===========================eval_params===========================
......
......@@ -17,7 +17,7 @@ norm_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet152.yaml -o Glo
pact_train:null
fpgm_train:null
distill_train:null
null:null
to_static_train:-o Global.to_static=True
null:null
##
===========================eval_params===========================
......
......@@ -17,7 +17,7 @@ norm_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50.yaml -o Glob
pact_train:null
fpgm_train:null
distill_train:null
null:null
to_static_train:-o Global.to_static=True
null:null
##
===========================eval_params===========================
......
model_item=ResNet50
bs_item=256
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_epochs=1
......@@ -10,8 +9,8 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
model_item=ResNet50
bs_item=64
fp_item=fp32
run_process_type=SingleP
run_mode=DP
device_num=N1C1
max_epochs=1
......@@ -10,8 +9,8 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
# run profiling
sleep 10;
export PROFILING=true
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
model_item=ResNet50
bs_item=256
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_epochs=8
......@@ -10,4 +9,4 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
model_item=ResNet50
bs_item=64
fp_item=fp32
run_process_type=MultiP
run_mode=DP
device_num=N1C8
max_epochs=8
......@@ -10,4 +9,4 @@ num_workers=8
# get data
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
# run
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
function _set_params(){
model_item=${1:-"model_item"} # (required) model item
base_batch_size=${2:-"2"} # (required) for single-process static graph, the batch size per card; multiply by the card count at training time
fp_item=${3:-"fp32"} # (required) fp32|fp16
run_process_type=${4:-"SingleP"} # (required) single process SingleP | multi process MultiP
run_mode=${5:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${6:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
run_mode=${4:-"DP"} # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
device_num=${5:-"N1C1"} # (required) number of cards used, N1C1|N1C8|N4C32 (4 machines, 32 cards)
profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed via a global variable
model_repo="PaddleClas" # (required) name of the model repo
speed_unit="samples/sec" # (required) unit of the speed metric
skip_steps=10 # (required) when parsing logs, skip the first few steps whose performance is unstable
keyword="ips:" # (required) when parsing logs, the keyword that marks lines containing performance data
convergence_key="loss:" # (optional) when parsing logs, the keyword that marks lines containing convergence data, e.g. convergence_key="loss:"
max_epochs=${7:-"1"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use the max_epoch parameter
num_workers=${8:-"4"} # (optional)
max_epochs=${6:-"1"} # (optional) keep the model run time within 5 minutes; if code changes are needed to stop early, submit a PR to the suite, or use the max_epoch parameter
num_workers=${7:-"4"} # (optional)
# The following are generic execution commands; no modification needed in ordinary cases
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (required) do not change the format; it is aligned with competitor naming
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (required) do not change the format; it is aligned with competitor naming
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
......@@ -50,13 +49,19 @@ function _train(){
train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config} -o Global.eval_during_train=False"
# 以下为通用执行命令,无特殊可不用修改
case ${run_process_type} in
SingleP)
train_cmd="python ppcls/static/train.py ${train_cmd}";;
MultiP)
train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}";;
*) echo "choose run_process_type(SingleP or MultiP)"; exit 1;
case ${run_mode} in
DP) if [[ ${device_num} = "N1C1" ]];then
echo "run ${run_mode} ${device_num}"
train_cmd="python ppcls/static/train.py ${train_cmd}"
else
rm -rf ./mylog
train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}"
fi
;;
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
*) echo "choose run_mode "; exit 1;
esac
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
timeout 5m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
......@@ -65,7 +70,7 @@ function _train(){
echo -e "${model_name}, SUCCESS"
fi
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
if [ ${device_num} != "N1C1" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
......
......@@ -40,8 +40,8 @@ fpgm_key=$(func_parser_key "${lines[17]}")
fpgm_trainer=$(func_parser_value "${lines[17]}")
distill_key=$(func_parser_key "${lines[18]}")
distill_trainer=$(func_parser_value "${lines[18]}")
trainer_key1=$(func_parser_key "${lines[19]}")
trainer_value1=$(func_parser_value "${lines[19]}")
to_static_key=$(func_parser_key "${lines[19]}")
to_static_trainer=$(func_parser_value "${lines[19]}")
trainer_key2=$(func_parser_key "${lines[20]}")
trainer_value2=$(func_parser_value "${lines[20]}")
......@@ -246,9 +246,12 @@ else
elif [ ${trainer} = "${distill_key}" ]; then
run_train=${distill_trainer}
run_export=${distill_export}
elif [ ${trainer} = ${trainer_key1} ]; then
run_train=${trainer_value1}
run_export=${export_value1}
# In the @to_static case, we reuse norm_trainer,
# but append "-o Global.to_static=True" to the config
# to trigger the "apply_to_static" logic in 'engine.py'
elif [ ${trainer} = "${to_static_key}" ]; then
run_train="${norm_trainer} ${to_static_trainer}"
run_export=${norm_export}
elif [[ ${trainer} = ${trainer_key2} ]]; then
run_train=${trainer_value2}
run_export=${export_value2}
......