// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle_api.h" // NOLINT #include #include "cls_process.h" #include "crnn_process.h" #include "db_post_process.h" using namespace paddle::lite_api; // NOLINT using namespace std; // fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up void NeonMeanScale(const float *din, float *dout, int size, const std::vector mean, const std::vector scale) { if (mean.size() != 3 || scale.size() != 3) { std::cerr << "[ERROR] mean or scale size must equal to 3\n"; exit(1); } float32x4_t vmean0 = vdupq_n_f32(mean[0]); float32x4_t vmean1 = vdupq_n_f32(mean[1]); float32x4_t vmean2 = vdupq_n_f32(mean[2]); float32x4_t vscale0 = vdupq_n_f32(scale[0]); float32x4_t vscale1 = vdupq_n_f32(scale[1]); float32x4_t vscale2 = vdupq_n_f32(scale[2]); float *dout_c0 = dout; float *dout_c1 = dout + size; float *dout_c2 = dout + size * 2; int i = 0; for (; i < size - 3; i += 4) { float32x4x3_t vin3 = vld3q_f32(din); float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); float32x4_t vs0 = vmulq_f32(vsub0, vscale0); float32x4_t vs1 = vmulq_f32(vsub1, vscale1); float32x4_t vs2 = vmulq_f32(vsub2, vscale2); vst1q_f32(dout_c0, vs0); vst1q_f32(dout_c1, vs1); vst1q_f32(dout_c2, vs2); din += 12; dout_c0 += 4; dout_c1 += 4; dout_c2 += 4; } for (; i < size; i++) { *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; *(dout_c1++) = (*(din++) - mean[1]) * scale[1]; *(dout_c2++) = (*(din++) - mean[2]) * scale[2]; } } // resize image to a size multiple of 32 which is required by the network cv::Mat DetResizeImg(const cv::Mat img, int max_size_len, std::vector &ratio_hw) { int w = img.cols; int h = img.rows; float ratio = 1.f; int max_wh = w >= h ? w : h; if (max_wh > max_size_len) { if (h > w) { ratio = static_cast(max_size_len) / static_cast(h); } else { ratio = static_cast(max_size_len) / static_cast(w); } } int resize_h = static_cast(float(h) * ratio); int resize_w = static_cast(float(w) * ratio); if (resize_h % 32 == 0) resize_h = resize_h; else if (resize_h / 32 < 1 + 1e-5) resize_h = 32; else resize_h = (resize_h / 32 - 1) * 32; if (resize_w % 32 == 0) resize_w = resize_w; else if (resize_w / 32 < 1 + 1e-5) resize_w = 32; else resize_w = (resize_w / 32 - 1) * 32; cv::Mat resize_img; cv::resize(img, resize_img, cv::Size(resize_w, resize_h)); ratio_hw.push_back(static_cast(resize_h) / static_cast(h)); ratio_hw.push_back(static_cast(resize_w) / static_cast(w)); return resize_img; } cv::Mat RunClsModel(cv::Mat img, std::shared_ptr predictor_cls, const float thresh = 0.9) { std::vector mean = {0.5f, 0.5f, 0.5f}; std::vector scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; cv::Mat srcimg; img.copyTo(srcimg); cv::Mat crop_img; img.copyTo(crop_img); cv::Mat resize_img; int index = 0; float wh_ratio = static_cast(crop_img.cols) / static_cast(crop_img.rows); resize_img = ClsResizeImg(crop_img); resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f); const float *dimg = reinterpret_cast(resize_img.data); std::unique_ptr input_tensor0(std::move(predictor_cls->GetInput(0))); input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols}); auto *data0 = input_tensor0->mutable_data(); NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale); // Run CLS predictor predictor_cls->Run(); // Get output and run postprocess std::unique_ptr softmax_out( std::move(predictor_cls->GetOutput(0))); auto *softmax_scores = softmax_out->mutable_data(); auto softmax_out_shape = softmax_out->shape(); float score = 0; int label = 0; for (int i = 0; i < softmax_out_shape[1]; i++) { if (softmax_scores[i] > score) { score = softmax_scores[i]; label = i; } } if (label % 2 == 1 && score > thresh) { cv::rotate(srcimg, srcimg, 1); } return srcimg; } void RunRecModel(std::vector>> boxes, cv::Mat img, std::shared_ptr predictor_crnn, std::vector &rec_text, std::vector &rec_text_score, std::vector charactor_dict, std::shared_ptr predictor_cls, int use_direction_classify) { std::vector mean = {0.5f, 0.5f, 0.5f}; std::vector scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f}; cv::Mat srcimg; img.copyTo(srcimg); cv::Mat crop_img; cv::Mat resize_img; int index = 0; for (int i = boxes.size() - 1; i >= 0; i--) { crop_img = GetRotateCropImage(srcimg, boxes[i]); if (use_direction_classify >= 1) { crop_img = RunClsModel(crop_img, predictor_cls); } float wh_ratio = static_cast(crop_img.cols) / static_cast(crop_img.rows); resize_img = CrnnResizeImg(crop_img, wh_ratio); resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f); const float *dimg = reinterpret_cast(resize_img.data); std::unique_ptr input_tensor0( std::move(predictor_crnn->GetInput(0))); input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols}); auto *data0 = input_tensor0->mutable_data(); NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale); //// Run CRNN predictor predictor_crnn->Run(); // Get output and run postprocess std::unique_ptr output_tensor0( std::move(predictor_crnn->GetOutput(0))); auto *predict_batch = output_tensor0->data(); auto predict_shape = output_tensor0->shape(); // ctc decode std::string str_res; int argmax_idx; int last_index = 0; float score = 0.f; int count = 0; float max_value = 0.0f; for (int n = 0; n < predict_shape[1]; n++) { argmax_idx = int(Argmax(&predict_batch[n * predict_shape[2]], &predict_batch[(n + 1) * predict_shape[2]])); max_value = float(*std::max_element(&predict_batch[n * predict_shape[2]], &predict_batch[(n + 1) * predict_shape[2]])); if (argmax_idx > 0 && (!(n > 0 && argmax_idx == last_index))) { score += max_value; count += 1; str_res += charactor_dict[argmax_idx]; } last_index = argmax_idx; } score /= count; rec_text.push_back(str_res); rec_text_score.push_back(score); } } std::vector>> RunDetModel(std::shared_ptr predictor, cv::Mat img, std::map Config) { // Read img int max_side_len = int(Config["max_side_len"]); int det_db_use_dilate = int(Config["det_db_use_dilate"]); cv::Mat srcimg; img.copyTo(srcimg); std::vector ratio_hw; img = DetResizeImg(img, max_side_len, ratio_hw); cv::Mat img_fp; img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f); // Prepare input data from image std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols}); auto *data0 = input_tensor0->mutable_data(); std::vector mean = {0.485f, 0.456f, 0.406f}; std::vector scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f}; const float *dimg = reinterpret_cast(img_fp.data); NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale); // Run predictor predictor->Run(); // Get output and post process std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); auto *outptr = output_tensor->data(); auto shape_out = output_tensor->shape(); // Save output float pred[shape_out[2] * shape_out[3]]; unsigned char cbuf[shape_out[2] * shape_out[3]]; for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) { pred[i] = static_cast(outptr[i]); cbuf[i] = static_cast((outptr[i]) * 255); } cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1, reinterpret_cast(cbuf)); cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F, reinterpret_cast(pred)); const double threshold = double(Config["det_db_thresh"]) * 255; const double max_value = 255; cv::Mat bit_map; cv::threshold(cbuf_map, bit_map, threshold, max_value, cv::THRESH_BINARY); if (det_db_use_dilate == 1) { cv::Mat dilation_map; cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2)); cv::dilate(bit_map, dilation_map, dila_ele); bit_map = dilation_map; } auto boxes = BoxesFromBitmap(pred_map, bit_map, Config); std::vector>> filter_boxes = FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg); return filter_boxes; } std::shared_ptr loadModel(std::string model_file) { MobileConfig config; config.set_model_from_file(model_file); std::shared_ptr predictor = CreatePaddlePredictor(config); return predictor; } cv::Mat Visualization(cv::Mat srcimg, std::vector>> boxes) { cv::Point rook_points[boxes.size()][4]; for (int n = 0; n < boxes.size(); n++) { for (int m = 0; m < boxes[0].size(); m++) { rook_points[n][m] = cv::Point(static_cast(boxes[n][m][0]), static_cast(boxes[n][m][1])); } } cv::Mat img_vis; srcimg.copyTo(img_vis); for (int n = 0; n < boxes.size(); n++) { const cv::Point *ppt[1] = {rook_points[n]}; int npt[] = {4}; cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0); } cv::imwrite("./vis.jpg", img_vis); std::cout << "The detection visualized image saved in ./vis.jpg" << std::endl; return img_vis; } std::vector split(const std::string &str, const std::string &delim) { std::vector res; if ("" == str) return res; char *strs = new char[str.length() + 1]; std::strcpy(strs, str.c_str()); char *d = new char[delim.length() + 1]; std::strcpy(d, delim.c_str()); char *p = std::strtok(strs, d); while (p) { string s = p; res.push_back(s); p = std::strtok(NULL, d); } return res; } std::map LoadConfigTxt(std::string config_path) { auto config = ReadDict(config_path); std::map dict; for (int i = 0; i < config.size(); i++) { std::vector res = split(config[i], " "); dict[res[0]] = stod(res[1]); } return dict; } int main(int argc, char **argv) { if (argc < 5) { std::cerr << "[ERROR] usage: " << argv[0] << " det_model_file cls_model_file rec_model_file image_path " "charactor_dict\n"; exit(1); } std::string det_model_file = argv[1]; std::string rec_model_file = argv[2]; std::string cls_model_file = argv[3]; std::string img_path = argv[4]; std::string dict_path = argv[5]; //// load config from txt file auto Config = LoadConfigTxt("./config.txt"); int use_direction_classify = int(Config["use_direction_classify"]); auto start = std::chrono::system_clock::now(); auto det_predictor = loadModel(det_model_file); auto rec_predictor = loadModel(rec_model_file); auto cls_predictor = loadModel(cls_model_file); auto charactor_dict = ReadDict(dict_path); charactor_dict.insert(charactor_dict.begin(), "#"); // blank char for ctc charactor_dict.push_back(" "); cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR); auto boxes = RunDetModel(det_predictor, srcimg, Config); std::vector rec_text; std::vector rec_text_score; RunRecModel(boxes, srcimg, rec_predictor, rec_text, rec_text_score, charactor_dict, cls_predictor, use_direction_classify); auto end = std::chrono::system_clock::now(); auto duration = std::chrono::duration_cast(end - start); //// visualization auto img_vis = Visualization(srcimg, boxes); //// print recognized text for (int i = 0; i < rec_text.size(); i++) { std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i] << std::endl; } std::cout << "花费了" << double(duration.count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den << "秒" << std::endl; return 0; }