diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md
index 9bf898fd79b6b1642ce20fabda3009708473c354..f9c736d56e0b1b2a9b0a270149404c6afd4ec2bf 100644
--- a/PPOCRLabel/README_ch.md
+++ b/PPOCRLabel/README_ch.md
@@ -86,17 +86,18 @@ PPOCRLabel --lang ch --kie True # launch [KIE mode], for labeling [detection +
 
 > If the installation above fails, refer to Section 3.6, Error Messages
 
-#### 1.2.2 Build the whl package locally and install it
+#### 1.2.2 Run PPOCRLabel from the Python script
+
+If you have modified the PPOCRLabel sources (for example, to specify a new built-in model), running from the Python script makes it more convenient to see the effect of your changes. If you still need to launch from the whl package, rebuild it as described in the next section.
 
 ```bash
-cd PaddleOCR/PPOCRLabel
-python3 setup.py bdist_wheel
-pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
+cd ./PPOCRLabel # switch to the PPOCRLabel directory
+python PPOCRLabel.py --lang ch
 ```
 
-#### 1.2.3 Run PPOCRLabel from the Python script
+#### 1.2.3 Build the whl package locally and install it
 
-If you have modified the PPOCRLabel files, running from the Python script makes it more convenient to see the effect of your changes
+Build and install a new whl package; 1.0.2 is the version number, and a new version can be set in `setup.py`.
 
 ```bash
 cd ./PPOCRLabel # switch to the PPOCRLabel directory
@@ -107,7 +108,6 @@ python PPOCRLabel.py --lang ch --kie True # launch [KIE mode], for labeling
 ```
 
-
 ## 2. Usage
 
 ### 2.1 Steps
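For orientation, the version number that the rewritten section 1.2.3 refers to lives in `PPOCRLabel/setup.py`. A hedged sketch of the relevant field (illustrative only; the real file contains more metadata):

```python
# PPOCRLabel/setup.py (illustrative excerpt; the actual file has more fields)
from setuptools import setup

setup(
    name='PPOCRLabel',
    version='1.0.2',  # bump this, then rebuild and install the matching dist/*.whl
)
```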
diff --git a/deploy/cpp_infer/external-cmake/auto-log.cmake b/deploy/cpp_infer/external-cmake/auto-log.cmake
index becbff0f45df51e5db541889ae1ffdacf2c4fc78..c998b3b14570aa77b9a307b0477f4caa7160e2a5 100644
--- a/deploy/cpp_infer/external-cmake/auto-log.cmake
+++ b/deploy/cpp_infer/external-cmake/auto-log.cmake
@@ -6,6 +6,7 @@ set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}/third-party")
 FetchContent_Declare(
     extern_Autolog
     PREFIX autolog
+    # If you don't have access to GitHub, replace it with https://gitee.com/Double_V/AutoLog
     GIT_REPOSITORY https://github.com/LDOUBLEV/AutoLog.git
     GIT_TAG main
 )
diff --git a/deploy/cpp_infer/include/ocr_det.h b/deploy/cpp_infer/include/ocr_det.h
index 657ab25d8854ec54c27d71485fe9eeddc65013c3..6e4086fbaa6945b9f685e6844b7e701283de2dae 100644
--- a/deploy/cpp_infer/include/ocr_det.h
+++ b/deploy/cpp_infer/include/ocr_det.h
@@ -46,8 +46,7 @@ public:
              const double &det_db_box_thresh,
              const double &det_db_unclip_ratio,
              const bool &use_polygon_score, const bool &use_dilation,
-             const bool &visualize, const bool &use_tensorrt,
-             const std::string &precision) {
+             const bool &use_tensorrt, const std::string &precision) {
     this->use_gpu_ = use_gpu;
     this->gpu_id_ = gpu_id;
     this->gpu_mem_ = gpu_mem;
@@ -62,7 +61,6 @@ public:
     this->use_polygon_score_ = use_polygon_score;
     this->use_dilation_ = use_dilation;
 
-    this->visualize_ = visualize;
     this->use_tensorrt_ = use_tensorrt;
     this->precision_ = precision;
diff --git a/deploy/cpp_infer/include/ocr_rec.h b/deploy/cpp_infer/include/ocr_rec.h
index ff80ba5299014885fc4c900fb87b5dcc6042744a..4052553d967fb365c3fb895c9d5b8145935fd45d 100644
--- a/deploy/cpp_infer/include/ocr_rec.h
+++ b/deploy/cpp_infer/include/ocr_rec.h
@@ -44,7 +44,8 @@ public:
                  const int &gpu_id, const int &gpu_mem,
                  const int &cpu_math_library_num_threads,
                  const bool &use_mkldnn, const string &label_path,
-                 const bool &use_tensorrt, const std::string &precision,
+                 const bool &use_tensorrt,
+                 const std::string &precision,
                  const int &rec_batch_num) {
     this->use_gpu_ = use_gpu;
     this->gpu_id_ = gpu_id;
@@ -66,7 +67,8 @@ public:
   // Load Paddle inference model
   void LoadModel(const std::string &model_dir);
 
-  void Run(std::vector<cv::Mat> img_list, std::vector<double> *times);
+  void Run(std::vector<cv::Mat> img_list, std::vector<std::string> &rec_texts,
+           std::vector<float> &rec_text_scores, std::vector<double> *times);
 
 private:
   std::shared_ptr<Predictor> predictor_;
@@ -85,7 +87,7 @@ private:
   bool use_tensorrt_ = false;
   std::string precision_ = "fp32";
   int rec_batch_num_ = 6;
-
+
   // pre-process
   CrnnResizeImg resize_op_;
   Normalize normalize_op_;
diff --git a/deploy/cpp_infer/include/utility.h b/deploy/cpp_infer/include/utility.h
index 5797559f7550da6bb38b014c46c1492124a9e065..f0dddacdac31e979a96648433662c76ccf972ad2 100644
--- a/deploy/cpp_infer/include/utility.h
+++ b/deploy/cpp_infer/include/utility.h
@@ -38,7 +38,8 @@ public:
 
   static void VisualizeBboxes(const cv::Mat &srcimg,
-                              const std::vector<std::vector<std::vector<int>>> &boxes);
+                              const std::vector<std::vector<std::vector<int>>> &boxes,
+                              const std::string &save_path);
 
   template <class ForwardIterator>
   inline static size_t argmax(ForwardIterator first, ForwardIterator last) {
@@ -47,12 +48,13 @@ public:
 
   static void GetAllFiles(const char *dir_name,
                           std::vector<std::string> &all_inputs);
-
+
   static cv::Mat GetRotateCropImage(const cv::Mat &srcimage,
-                                    std::vector<std::vector<int>> box);
-
-  static std::vector<int> argsort(const std::vector<float>& array);
+                                    std::vector<std::vector<int>> box);
+
+  static std::vector<int> argsort(const std::vector<float> &array);
+
+  static std::string basename(const std::string &filename);
 };
 
 } // namespace PaddleOCR
\ No newline at end of file
diff --git a/deploy/cpp_infer/readme.md b/deploy/cpp_infer/readme.md
index 8ca0e4a8c6c0eb7d09312645b70291d7e8c8016e..e7104881027b111de6821af8244ea2a6092fc14b 100644
--- a/deploy/cpp_infer/readme.md
+++ b/deploy/cpp_infer/readme.md
@@ -30,7 +30,7 @@ PaddleOCR model deployment.
 ### 1.0 Prerequisites
 
 - Linux environment; docker is recommended.
-- Windows environment; currently compilation with `Visual Studio 2019 Community` is supported.
+- Windows environment.
 
 * This document mainly describes the PaddleOCR C++ inference pipeline on Linux; for C++ inference with the prediction library on Windows, see the [Windows build tutorial](./docs/windows_vs2019_build.md)
@@ -256,6 +256,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
 |gpu_mem|int|4000|GPU memory to request|
 |cpu_math_library_num_threads|int|10|Number of threads for CPU inference; with enough CPU cores, the larger the value, the faster the inference|
 |enable_mkldnn|bool|true|Whether to use the mkldnn library|
+|output|str|./output|Directory where visualization results are saved|
 
 - Detection model parameters
 
@@ -267,7 +268,7 @@ CUDNN_LIB_DIR=/your_cudnn_lib_dir
 |det_db_box_thresh|float|0.5|DB post-processing box-filtering threshold; reduce it if boxes are being missed|
 |det_db_unclip_ratio|float|1.6|Compactness of the text box; the smaller the value, the tighter the box hugs the text|
 |use_polygon_score|bool|false|Whether to compute the bbox score with the polygon; false means the bounding rectangle is used. The rectangle is faster to compute, the polygon is more accurate for curved text regions.|
-|visualize|bool|true|Whether to visualize the results; when set to 1, the prediction result is saved as `ocr_vis.png` in the current directory.|
+|visualize|bool|true|Whether to visualize the results; when set to 1, the prediction result is saved in the directory given by the `output` field, as an image with the same name as the input image.|
 
 - Angle classifier parameters
diff --git a/deploy/cpp_infer/readme_en.md b/deploy/cpp_infer/readme_en.md
index 55160ede6bdd2f387124021f9ff25cdfb6b5a23a..61d65095394a9f5b7323bf8eb7324cd1e91b1346 100644
--- a/deploy/cpp_infer/readme_en.md
+++ b/deploy/cpp_infer/readme_en.md
@@ -26,6 +26,7 @@ This section will introduce how to configure the C++ environment and deploy PaddleOCR
 ### Environment
 
 - Linux, docker is recommended.
+- Windows.
 
 ### 1.1 Compile OpenCV
@@ -248,6 +249,7 @@ More parameters are as follows,
 |gpu_mem|int|4000|GPU memory requested|
 |cpu_math_library_num_threads|int|10|Number of threads when using CPU inference. When there are enough machine cores, the larger the value, the faster the inference speed|
 |enable_mkldnn|bool|true|Whether to use the mkldnn library|
+|output|str|./output|Path where visualization results are saved|
 
 - Detection related parameters
 
@@ -259,7 +261,7 @@ More parameters are as follows,
 |det_db_box_thresh|float|0.5|DB post-processing filter box threshold; if boxes are being missed, it can be reduced as appropriate|
 |det_db_unclip_ratio|float|1.6|Indicates the compactness of the text box; the smaller the value, the closer the text box to the text|
 |use_polygon_score|bool|false|Whether to use the polygon box to calculate the bbox score; false means the rectangle box is used. The rectangular box is faster to calculate, and the polygon box is more accurate for curved text areas.|
-|visualize|bool|true|Whether to visualize the results; when it is set as true, the prediction result will be saved in the image file `./ocr_vis.png`.|
+|visualize|bool|true|Whether to visualize the results; when it is set as true, the prediction results will be saved in the folder specified by the `output` field, on an image with the same name as the input image.|
 
 - Classifier related parameters
diff --git a/deploy/cpp_infer/src/main.cpp b/deploy/cpp_infer/src/main.cpp
index 31d0685f543a1441eab8b9d2595d008ff65763f8..efc1e50ce929b4f68dff3437faa05b9ac46c2aa0 100644
--- a/deploy/cpp_infer/src/main.cpp
+++ b/deploy/cpp_infer/src/main.cpp
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "glog/logging.h"
 #include "omp.h"
 #include "opencv2/core.hpp"
 #include "opencv2/imgcodecs.hpp"
@@ -21,13 +20,13 @@
 #include <chrono>
 #include <iomanip>
 #include <iostream>
+#include <sys/stat.h>
 #include <ostream>
 #include <vector>
 
 #include <algorithm>
-#include <cstring>
 #include <fstream>
 #include <numeric>
@@ -45,7 +44,7 @@
 DEFINE_bool(enable_mkldnn, false, "Whether use mkldnn with CPU.");
 DEFINE_bool(use_tensorrt, false, "Whether use tensorrt.");
 DEFINE_string(precision, "fp32", "Precision be one of fp32/fp16/int8");
 DEFINE_bool(benchmark, false, "Whether use benchmark.");
-DEFINE_string(save_log_path, "./log_output/", "Save benchmark log path.");
+DEFINE_string(output, "./output/", "Save benchmark log path.");
 // detection related
 DEFINE_string(image_dir, "", "Dir of input image.");
 DEFINE_string(det_model_dir, "", "Path of det inference model.");
@@ -86,11 +85,17 @@ int main_det(std::vector<cv::String> cv_all_img_names) {
                  FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn,
                  FLAGS_max_side_len, FLAGS_det_db_thresh,
                  FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio,
-                 FLAGS_use_polygon_score, FLAGS_use_dilation, FLAGS_visualize,
+                 FLAGS_use_polygon_score, FLAGS_use_dilation,
                  FLAGS_use_tensorrt, FLAGS_precision);
 
+  if (!PathExists(FLAGS_output)) {
+    mkdir(FLAGS_output.c_str(), 0777);
+  }
+
   for (int i = 0; i < cv_all_img_names.size(); ++i) {
-    // LOG(INFO) << "The predict img: " << cv_all_img_names[i];
+    if (!FLAGS_benchmark) {
+      cout << "The predict img: " << cv_all_img_names[i] << endl;
+    }
 
     cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
     if (!srcimg.data) {
@@ -102,7 +107,11 @@ int main_det(std::vector<cv::String> cv_all_img_names) {
     std::vector<std::vector<std::vector<int>>> boxes;
     std::vector<double> det_times;
 
     det.Run(srcimg, boxes, &det_times);
-
+    // visualization
+    if (FLAGS_visualize) {
+      std::string file_name = Utility::basename(cv_all_img_names[i]);
+      Utility::VisualizeBboxes(srcimg, boxes, FLAGS_output + "/" + file_name);
+    }
     time_info[0] += det_times[0];
     time_info[1] += det_times[1];
     time_info[2] += det_times[2];
@@ -142,8 +151,6 @@ int main_rec(std::vector<cv::String> cv_all_img_names) {
 
   std::vector<cv::Mat> img_list;
   for (int i = 0; i < cv_all_img_names.size(); ++i) {
-    LOG(INFO) << "The predict img: " << cv_all_img_names[i];
-
     cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
     if (!srcimg.data) {
       std::cerr << "[ERROR] image read failed! image path: "
@@ -152,8 +159,15 @@ int main_rec(std::vector<cv::String> cv_all_img_names) {
                 << cv_all_img_names[i] << endl;
       exit(1);
     }
     img_list.push_back(srcimg);
   }
+  std::vector<std::string> rec_texts(img_list.size(), "");
+  std::vector<float> rec_text_scores(img_list.size(), 0);
   std::vector<double> rec_times;
-  rec.Run(img_list, &rec_times);
+  rec.Run(img_list, rec_texts, rec_text_scores, &rec_times);
+  // output rec results
+  for (int i = 0; i < rec_texts.size(); i++) {
+    cout << "The predict img: " << cv_all_img_names[i] << "\t" << rec_texts[i]
+         << "\t" << rec_text_scores[i] << endl;
+  }
   time_info[0] += rec_times[0];
   time_info[1] += rec_times[1];
   time_info[2] += rec_times[2];
@@ -172,11 +186,15 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
   std::vector<double> time_info_det = {0, 0, 0};
   std::vector<double> time_info_rec = {0, 0, 0};
 
+  if (!PathExists(FLAGS_output)) {
+    mkdir(FLAGS_output.c_str(), 0777);
+  }
+
   DBDetector det(FLAGS_det_model_dir, FLAGS_use_gpu, FLAGS_gpu_id,
                  FLAGS_gpu_mem, FLAGS_cpu_threads, FLAGS_enable_mkldnn,
                  FLAGS_max_side_len, FLAGS_det_db_thresh,
                  FLAGS_det_db_box_thresh, FLAGS_det_db_unclip_ratio,
-                 FLAGS_use_polygon_score, FLAGS_use_dilation, FLAGS_visualize,
+                 FLAGS_use_polygon_score, FLAGS_use_dilation,
                  FLAGS_use_tensorrt, FLAGS_precision);
 
   Classifier *cls = nullptr;
@@ -197,7 +215,7 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
                        FLAGS_rec_batch_num);
 
   for (int i = 0; i < cv_all_img_names.size(); ++i) {
-    LOG(INFO) << "The predict img: " << cv_all_img_names[i];
+    cout << "The predict img: " << cv_all_img_names[i] << endl;
 
     cv::Mat srcimg = cv::imread(cv_all_img_names[i], cv::IMREAD_COLOR);
     if (!srcimg.data) {
@@ -205,15 +223,21 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
                 << cv_all_img_names[i] << endl;
       exit(1);
     }
+    // det
     std::vector<std::vector<std::vector<int>>> boxes;
     std::vector<double> det_times;
     std::vector<double> rec_times;
 
     det.Run(srcimg, boxes, &det_times);
+    if (FLAGS_visualize) {
+      std::string file_name = Utility::basename(cv_all_img_names[i]);
+      Utility::VisualizeBboxes(srcimg, boxes, FLAGS_output + "/" + file_name);
+    }
     time_info_det[0] += det_times[0];
     time_info_det[1] += det_times[1];
     time_info_det[2] += det_times[2];
 
+    // rec
     std::vector<cv::Mat> img_list;
     for (int j = 0; j < boxes.size(); j++) {
       cv::Mat crop_img;
@@ -223,8 +247,14 @@ int main_system(std::vector<cv::String> cv_all_img_names) {
       }
       img_list.push_back(crop_img);
     }
-
-    rec.Run(img_list, &rec_times);
+    std::vector<std::string> rec_texts(img_list.size(), "");
+    std::vector<float> rec_text_scores(img_list.size(), 0);
+    rec.Run(img_list, rec_texts, rec_text_scores, &rec_times);
+    // output rec results
+    for (int i = 0; i < rec_texts.size(); i++) {
+      std::cout << i << "\t" << rec_texts[i] << "\t" << rec_text_scores[i]
+                << std::endl;
+    }
     time_info_rec[0] += rec_times[0];
     time_info_rec[1] += rec_times[1];
     time_info_rec[2] += rec_times[2];
diff --git a/deploy/cpp_infer/src/ocr_det.cpp b/deploy/cpp_infer/src/ocr_det.cpp
index ad78999449d94dcaf2e336087de5c6837f3b233c..d72dc40cddb0845c370f5ad4bb9b6e2f6fe0bf2f 100644
--- a/deploy/cpp_infer/src/ocr_det.cpp
+++ b/deploy/cpp_infer/src/ocr_det.cpp
@@ -175,11 +175,6 @@ void DBDetector::Run(cv::Mat &img,
   std::chrono::duration<float> postprocess_diff =
       postprocess_end - postprocess_start;
   times->push_back(double(postprocess_diff.count() * 1000));
-
-  //// visualization
-  if (this->visualize_) {
-    Utility::VisualizeBboxes(srcimg, boxes);
-  }
 }
 
 } // namespace PaddleOCR
diff --git a/deploy/cpp_infer/src/ocr_rec.cpp b/deploy/cpp_infer/src/ocr_rec.cpp
index 25224f88acecd33f5efaa34a9dfc71639663d53f..4c94e8f3fc966d2a4de8c7aad0e5ef4d4b69c804 100644
--- a/deploy/cpp_infer/src/ocr_rec.cpp
+++ b/deploy/cpp_infer/src/ocr_rec.cpp
@@ -17,6 +17,8 @@
 namespace PaddleOCR {
 
 void CRNNRecognizer::Run(std::vector<cv::Mat> img_list,
+                         std::vector<std::string> &rec_texts,
+                         std::vector<float> &rec_text_scores,
                          std::vector<double> *times) {
   std::chrono::duration<float> preprocess_diff =
       std::chrono::steady_clock::now() - std::chrono::steady_clock::now();
@@ -86,7 +88,7 @@ void CRNNRecognizer::Run(std::vector<cv::Mat> img_list,
     // ctc decode
     auto postprocess_start = std::chrono::steady_clock::now();
     for (int m = 0; m < predict_shape[0]; m++) {
-      std::vector<std::string> str_res;
+      std::string str_res;
       int argmax_idx;
       int last_index = 0;
       float score = 0.f;
@@ -104,17 +106,16 @@ void CRNNRecognizer::Run(std::vector<cv::Mat> img_list,
         if (argmax_idx > 0 && (!(n > 0 && argmax_idx == last_index))) {
           score += max_value;
           count += 1;
-          str_res.push_back(label_list_[argmax_idx]);
+          str_res += label_list_[argmax_idx];
         }
         last_index = argmax_idx;
       }
       score /= count;
-      if (isnan(score))
+      if (isnan(score)) {
         continue;
-      for (int i = 0; i < str_res.size(); i++) {
-        std::cout << str_res[i];
       }
-      std::cout << "\tscore: " << score << std::endl;
+      rec_texts[indices[beg_img_no + m]] = str_res;
+      rec_text_scores[indices[beg_img_no + m]] = score;
     }
     auto postprocess_end = std::chrono::steady_clock::now();
     postprocess_diff += postprocess_end - postprocess_start;
diff --git a/deploy/cpp_infer/src/utility.cpp b/deploy/cpp_infer/src/utility.cpp
index 6952be54eed14d06ddcf3572d9bd2f4153894534..034df07804745178368a621936cd1ddabfd3a050 100644
--- a/deploy/cpp_infer/src/utility.cpp
+++ b/deploy/cpp_infer/src/utility.cpp
@@ -40,7 +40,8 @@ std::vector<std::string> Utility::ReadDict(const std::string &path) {
 
 void Utility::VisualizeBboxes(
     const cv::Mat &srcimg,
-    const std::vector<std::vector<std::vector<int>>> &boxes) {
+    const std::vector<std::vector<std::vector<int>>> &boxes,
+    const std::string &save_path) {
   cv::Mat img_vis;
   srcimg.copyTo(img_vis);
   for (int n = 0; n < boxes.size(); n++) {
@@ -54,8 +55,8 @@ void Utility::VisualizeBboxes(
     cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
   }
 
-  cv::imwrite("./ocr_vis.png", img_vis);
-  std::cout << "The detection visualized image saved in ./ocr_vis.png"
+  cv::imwrite(save_path, img_vis);
+  std::cout << "The detection visualized image saved in " + save_path
             << std::endl;
 }
 
@@ -93,7 +94,7 @@ void Utility::GetAllFiles(const char *dir_name,
 }
 
 cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage,
-                                    std::vector<std::vector<int>> box) {
+                                    std::vector<std::vector<int>> box) {
   cv::Mat image;
   srcimage.copyTo(image);
   std::vector<std::vector<int>> points = box;
@@ -147,17 +148,52 @@ cv::Mat Utility::GetRotateCropImage(const cv::Mat &srcimage,
   }
 }
 
-std::vector<int> Utility::argsort(const std::vector<float>& array)
-{
-  const int array_len(array.size());
-  std::vector<int> array_index(array_len, 0);
-  for (int i = 0; i < array_len; ++i)
-    array_index[i] = i;
+std::vector<int> Utility::argsort(const std::vector<float> &array) {
+  const int array_len(array.size());
+  std::vector<int> array_index(array_len, 0);
+  for (int i = 0; i < array_len; ++i)
+    array_index[i] = i;
 
-  std::sort(array_index.begin(), array_index.end(),
-            [&array](int pos1, int pos2) {return (array[pos1] < array[pos2]); });
+  std::sort(
+      array_index.begin(), array_index.end(),
+      [&array](int pos1, int pos2) { return (array[pos1] < array[pos2]); });
 
-  return array_index;
+  return array_index;
+}
+
+std::string Utility::basename(const std::string &filename) {
+  if (filename.empty()) {
+    return "";
+  }
+
+  auto len = filename.length();
+  auto index = filename.find_last_of("/\\");
+
+  if (index == std::string::npos) {
+    return filename;
+  }
+
+  if (index + 1 >= len) {
+
+    len--;
+    index = filename.substr(0, len).find_last_of("/\\");
+
+    if (len == 0) {
+      return filename;
+    }
+
+    if (index == 0) {
+      return filename.substr(1, len - 1);
+    }
+
+    if (index == std::string::npos) {
+      return filename.substr(0, len);
+    }
+
+    return filename.substr(index + 1, len - index - 1);
+  }
+
+  return filename.substr(index + 1, len - index);
+}
 
 } // namespace PaddleOCR
\ No newline at end of file
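For intuition about the new `Utility::basename`: it ignores at most one trailing `/` or `\`, then returns everything after the last separator. A rough Python rendering of that behavior (an illustration only, not part of the patch):

```python
def basename(path: str) -> str:
    # Rough Python analogue of the C++ Utility::basename above (illustrative).
    if not path:
        return ""
    trimmed = path[:-1] if path[-1] in "/\\" else path
    if not trimmed:            # input was a bare separator such as "/"
        return path
    cut = max(trimmed.rfind("/"), trimmed.rfind("\\"))
    return trimmed[cut + 1:]

assert basename("img_12.jpg") == "img_12.jpg"
assert basename("./imgs/img_12.jpg") == "img_12.jpg"
assert basename("C:\\imgs\\img_12.jpg") == "img_12.jpg"
assert basename("./imgs/") == "imgs"
```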
diff --git a/doc/doc_ch/finetune.md b/doc/doc_ch/finetune.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8f146aadc079444c37e000d16ada8b6bda8ba18
--- /dev/null
+++ b/doc/doc_ch/finetune.md
@@ -0,0 +1,170 @@
+# Model Fine-tuning
+
+## 1. Background and Motivation
+
+The PP-OCR series models provided by PaddleOCR perform well in general scenarios and solve detection and recognition problems in the vast majority of cases. In vertical scenarios, if you want better results, fine-tuning can further improve the accuracy of the PP-OCR detection and recognition models.
+
+This document covers the main points to note when fine-tuning the text detection and recognition models, so that you can obtain higher-accuracy models for your own scenario.
+
+The key points are:
+
+1. The pretrained models provided by PP-OCR generalize well
+2. Adding a small amount of real data (detection: >=500 images, recognition: >=5000 images) greatly improves detection and recognition in vertical scenarios
+3. Adding real general-scenario data during fine-tuning further improves accuracy and generalization
+4. For text detection, increasing the prediction image scale further improves the detection of smaller text regions
+5. During fine-tuning, the hyperparameters (most importantly the learning rate and batch size) need to be adjusted appropriately for best results.
+
+See Chapters 2 and 3 for details.
+
+## 2. Fine-tuning the Text Detection Model
+
+### 2.1 Data Selection
+
+* Data volume: prepare at least 500 images for text detection fine-tuning.
+
+* Annotation: single-line text annotation format; the annotated box should match the actual semantic content. For example, on train tickets the surname and given name may be far apart, but they belong to the same semantic field and should be annotated as a single box.
+
+### 2.2 Model Selection
+
+It is recommended to fine-tune the PP-OCRv2 model (config: [ch_PP-OCRv2_det_student.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml), pretrained model: [ch_PP-OCRv2_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar)); its accuracy and generalization are the best among the currently provided pretrained models.
+
+For more PP-OCR series models, see the [PaddleOCR README](../../README_ch.md).
+
+Note: the pretrained model above also contains the teacher model, so the student model has to be extracted first; fine-tuning then starts from the student weights.
+
+```python
+import paddle
+# load the complete detection pretrained model
+a = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams")
+# extract the student model's parameters
+b = {k[len("student_model."):]: a[k] for k in a if "student_model." in k}
+# save the model for later fine-tuning
+paddle.save(b, "ch_PP-OCRv2_det_student.pdparams")
+```
+
+### 2.3 Training Hyperparameters
+
+When fine-tuning, the most important hyperparameters are the pretrained model path `pretrained_model`, the learning rate `learning_rate`, and `batch_size`. Part of the config file is shown below.
+
+```yaml
+Global:
+  pretrained_model: ./pretrain_models/student.pdparams # pretrained model path
+Optimizer:
+  lr:
+    name: Cosine
+    learning_rate: 0.001 # learning rate
+    warmup_epoch: 2
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 8 # single-card batch size
+    num_workers: 4
+```
+
+In the config above, first set `pretrained_model` to the path of the `ch_PP-OCRv2_det_student.pdparams` file extracted in Section 2.2.
+
+The config files provided by PaddleOCR assume 8-GPU training (total batch size `8*8=64`) without loading a pretrained model, so the learning rate needs to be scaled linearly with your total batch size, for example (a quick sketch of the rule follows this list):
+
+* For single-GPU training with batch_size=8, the total batch_size=8; a learning rate around `1e-4` is suggested.
+* For single-GPU training where GPU memory only allows batch_size=4, the total batch_size=4; a learning rate around `5e-5` is suggested.
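+The scaling rule above can be made concrete. The snippet below is an illustrative sketch (not a PaddleOCR API); the reference values come from the config shown above:
+
+```python
+def scaled_lr(total_batch_size, ref_lr=0.001, ref_batch_size=64):
+    """Linearly rescale the reference learning rate to a new total batch size."""
+    return ref_lr * total_batch_size / ref_batch_size
+
+print(scaled_lr(8))  # 0.000125, i.e. about 1e-4
+print(scaled_lr(4))  # 6.25e-05, i.e. about 5e-5
+```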
+### 2.4 Prediction Hyperparameters
+
+When exporting the trained model and running inference, the prediction image scale can be adjusted further to improve the detection of small text. The following DBNet inference hyperparameters can be tuned for better results.
+
+| Parameter | Type | Default | Meaning |
+| :--: | :--: | :--: | :--: |
+| det_db_thresh | float | 0.3 | In the probability map output by DB, only pixels with a score above this threshold are treated as text pixels |
+| det_db_box_thresh | float | 0.6 | A detected box is kept as a text region only if the average score of all pixels inside it is above this threshold |
+| det_db_unclip_ratio | float | 1.5 | Expansion ratio of the `Vatti clipping` algorithm, used to expand the text region |
+| max_batch_size | int | 10 | Prediction batch size |
+| use_dilation | bool | False | Whether to dilate the segmentation result for better detection |
+| det_db_score_mode | str | "fast" | Scoring method for DB detection results, `fast` or `slow`; `fast` averages over all pixels inside the polygon's bounding rectangle, while `slow` averages over all pixels inside the polygon itself, which is somewhat slower but more accurate. |
+
+For more on inference, see the [Paddle Inference tutorial](./inference.md).
+
+## 3. Fine-tuning the Text Recognition Model
+
+### 3.1 Data Selection
+
+* Data volume: without changing the dictionary, prepare at least 5000 images for recognition fine-tuning; if the dictionary is changed (not recommended), more data is needed.
+
+* Data distribution: keep it as close to the real scenario as possible. If the real scenario contains a lot of short text, the training data should as well; if space recognition matters, the training data should contain plenty of text with spaces.
+
+* General Chinese/English data: real general-purpose data can be added to the training set (e.g. LSVT, RCTW, MTWI when the dictionary is unchanged) to further improve generalization.
+
+### 3.2 Model Selection
+
+It is recommended to fine-tune the PP-OCRv2 model (config: [ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml), pretrained model: [ch_PP-OCRv2_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar)); its accuracy and generalization are the best among the currently provided pretrained models.
+
+For more PP-OCR series models, see the [PaddleOCR README](../../README_ch.md).
+
+### 3.3 Training Hyperparameters
+
+As with detection fine-tuning, the most important hyperparameters are the pretrained model path `pretrained_model`, the learning rate `learning_rate`, and `batch_size`. Part of the default config is shown below.
+
+```yaml
+Global:
+  pretrained_model:  # pretrained model path
+Optimizer:
+  lr:
+    name: Piecewise
+    decay_epochs : [700, 800]
+    values : [0.001, 0.0001]  # learning rate
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - ./train_data/train_list.txt
+    ratio_list: [1.0] # sampling ratio, default [1.0]
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 128 # single-card batch size
+    num_workers: 8
+
+```
+
+In the config above, first set `pretrained_model` to the path of the `ch_PP-OCRv2_rec_train/best_accuracy.pdparams` file unpacked in Section 3.2.
+
+The config files provided by PaddleOCR assume 8-GPU training (total batch size `8*128=1024`) without loading a pretrained model, so the learning rate needs to be scaled linearly with your total batch size, for example:
+
+* For single-GPU training with batch_size=128, the total batch_size=128; with the pretrained model loaded, a learning rate around `[1e-4, 2e-5]` is suggested (the Piecewise schedule needs two values, likewise below).
+* For single-GPU training where GPU memory only allows batch_size=64, the total batch_size=64; with the pretrained model loaded, a learning rate around `[5e-5, 1e-5]` is suggested.
+
+If general real-scenario data is added, it is recommended to keep the amount of vertical-scenario data and general data at about 1:1 per epoch.
+
+For example: your own vertical-scenario recognition data has 10k samples with label file `vertical.txt`, and the collected general-scenario recognition data has 100k samples with label file `general.txt`.
+
+Then `label_file_list` and `ratio_list` can be set as follows. In each epoch, `vertical.txt` is fully sampled (ratio 1.0), contributing 10k samples, and `general.txt` is sampled at a ratio of 0.1, contributing `100k*0.1=10k` samples, giving the desired 1:1 ratio.
+
+```yaml
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - vertical.txt
+    - general.txt
+    ratio_list: [1.0, 0.1]
+```
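As a quick cross-check of the `ratio_list` arithmetic in the new document above — a standalone sketch, not a PaddleOCR API:

```python
def samples_per_epoch(label_sizes, ratio_list):
    # Expected number of samples drawn from each label file in one epoch.
    return [int(n * r) for n, r in zip(label_sizes, ratio_list)]

# 10k vertical samples at ratio 1.0, 100k general samples at ratio 0.1
print(samples_per_epoch([10000, 100000], [1.0, 0.1]))  # [10000, 10000] -> 1:1
```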
diff --git a/doc/doc_ch/inference.md b/doc/doc_ch/inference.md
index c02da14af495cd807668dca6d7f3823d1de6820d..ade1a2dbdf728ac785efef3e5a82b4c932674b87 100755
--- a/doc/doc_ch/inference.md
+++ b/doc/doc_ch/inference.md
@@ -36,6 +36,8 @@ inference model (saved via `paddle.jit.save`)
 
 - [6. Parameter Description](#参数解释)
 
+- [7. FAQ](#FAQ)
+
 
 ## 1. Convert a Training Model to an Inference Model
@@ -520,3 +522,9 @@ PSE-related parameters are as follows
 | label_list | list | ['0', '180'] | Angle values corresponding to the class ids |
 | cls_batch_num | int | 6 | Batch size for angle classifier prediction |
 | cls_thresh | float | 0.9 | Prediction threshold; when the model predicts 180 degrees with a score above this threshold, the final result is taken as 180 degrees and the image is flipped |
+
+
+
+# 7. FAQ
+
+* An `inference model` exported with code older than Paddle 2.0 consists of the files `model` and `params`, which correspond to the `inference.pdmodel` and `inference.pdiparams` exported by Paddle 2.0 and later. The current PaddleOCR release branch no longer supports inference models exported before Paddle 2.0; to use them, use the code and documentation of the develop (static-graph) branch.
diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md
index f030b0ee5302256f571462a13d0a271873d26a30..cf55af29e7b6a0c92022b35746081776451627a0 100644
--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -247,7 +247,10 @@ PaddleOCR supports alternating training and evaluation; see `configs/rec/rec_icdar15_t
 | rec_r31_sar.yml | SAR | ResNet31 | None | LSTM encoder | LSTM decoder |
 | rec_resnet_stn_bilstm_att.yml | SEED | Aster_Resnet | STN | BiLSTM | att |
 
-*The SEED model additionally requires the FastText-trained [language model](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz)
+*The SEED model additionally requires the FastText-trained [language model](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz), and the fasttext dependency must be installed:
+```
+python3.7 -m pip install fasttext==0.9.1
+```
 
 For training on Chinese data, [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml) is recommended; to try other algorithms on a Chinese dataset, modify the config file as follows:
diff --git a/doc/joinus.PNG b/doc/joinus.PNG
index c3b26cf4814417b646785b42c7fcbaa0350edac6..5838a96bc8317178de07a16d246966bf6cc7df63 100644
Binary files a/doc/joinus.PNG and b/doc/joinus.PNG differ
diff --git a/paddleocr.py b/paddleocr.py
index 3a06158b0c92ea70e4320646d38e8c9e9295e9db..d07082f0ddc1133b3e9b3a7a7703d87f7cfeeedb 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -47,7 +47,7 @@ __all__ = [
 ]
 
 SUPPORT_DET_MODEL = ['DB']
-VERSION = '2.4.0.3'
+VERSION = '2.4.0.4'
 SUPPORT_REC_MODEL = ['CRNN']
 BASE_DIR = os.path.expanduser("~/.paddleocr/")
diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py
index a46cce7de2c8e59cf797db96fc6fcb7e25fa549a..3b7674268772d8a332b963fd6b82dfb71ee40212 100644
--- a/ppocr/modeling/heads/rec_sar_head.py
+++ b/ppocr/modeling/heads/rec_sar_head.py
@@ -216,7 +216,7 @@ class ParallelSARDecoder(BaseDecoder):
         self.pred_dropout = nn.Dropout(pred_dropout)
         pred_num_classes = self.num_classes - 1
         if pred_concat:
-            fc_in_channel = decoder_rnn_out_size + d_model + d_enc
+            fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size
         else:
             fc_in_channel = d_model
         self.prediction = nn.Linear(fc_in_channel, pred_num_classes)
diff --git a/ppstructure/vqa/README.md b/ppstructure/vqa/README.md
index b9a82cc5fd971800aaebd9bc4553ba6f0700845e..f142778506ee53ee8955f078b0116f033522a4e6 100644
--- a/ppstructure/vqa/README.md
+++ b/ppstructure/vqa/README.md
@@ -242,3 +242,7 @@ python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutxlm.yml -o Archi
 - LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf
 - microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm
 - XFUND dataset, https://github.com/doc-analysis/XFUND
+
+## License
+
+The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) license.
diff --git a/requirements.txt b/requirements.txt
index 1d9522aa0167c60ffce263a35b86640efb1438b2..b60d48371337e38bde6e51171aa6ecfb9573fb4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,3 @@ cython
 lxml
 premailer
 openpyxl
-fasttext==0.9.1
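A closing note on the `ppocr/modeling/heads/rec_sar_head.py` change above: with `pred_concat` enabled, the prediction layer consumes the concatenation of the decoder RNN output, the attention context (width `d_model`), and the holistic encoder feature — and the width of that last piece is the encoder RNN output size, which differs from `d_enc` when the encoder is bidirectional. A shape sketch with illustrative sizes (the `*_out_size` derivation mirrors the usual SAR setup and is an assumption here):

```python
# Illustrative sizes; a bidirectional encoder doubles the feature width.
d_model, d_enc = 512, 512
enc_bi_rnn, dec_bi_rnn = True, False
encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1)                    # 1024
decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1)    # 1024
fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size  # 2560
# The old `... + d_enc` would give 2048, mismatching the concatenated tensor.
```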