ocr_db_crnn.cc 13.6 KB
Newer Older
L
LDOUVLEV 已提交
1
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
L
LDOUBLEV 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15
#include "paddle_api.h" // NOLINT
L
LDOUBLEV 已提交
16 17
#include <chrono>

W
WenmuZhou 已提交
18
#include "cls_process.h"
L
LDOUVLEV 已提交
19 20
#include "crnn_process.h"
#include "db_post_process.h"
L
LDOUBLEV 已提交
21

22
using namespace paddle::lite_api; // NOLINT
L
LDOUVLEV 已提交
23
using namespace std;
L
LDOUBLEV 已提交
24 25

// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
L
LDOUBLEV 已提交
26 27 28
void NeonMeanScale(const float *din, float *dout, int size,
                   const std::vector<float> mean,
                   const std::vector<float> scale) {
L
LDOUBLEV 已提交
29 30 31 32 33 34 35 36 37 38 39
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    exit(1);
  }
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(scale[2]);

40 41 42
  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;
L
LDOUBLEV 已提交
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69

  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
70 71
cv::Mat DetResizeImg(const cv::Mat img, int max_size_len,
                     std::vector<float> &ratio_hw) {
L
LDOUBLEV 已提交
72 73 74 75
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
L
LDOUVLEV 已提交
76 77 78
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
L
LDOUBLEV 已提交
79
      ratio = static_cast<float>(max_size_len) / static_cast<float>(h);
L
LDOUBLEV 已提交
80
    } else {
L
LDOUBLEV 已提交
81
      ratio = static_cast<float>(max_size_len) / static_cast<float>(w);
L
LDOUBLEV 已提交
82 83 84
    }
  }

L
LDOUBLEV 已提交
85 86
  int resize_h = static_cast<int>(float(h) * ratio);
  int resize_w = static_cast<int>(float(w) * ratio);
L
LDOUBLEV 已提交
87 88
  if (resize_h % 32 == 0)
    resize_h = resize_h;
L
LDOUVLEV 已提交
89
  else if (resize_h / 32 < 1 + 1e-5)
L
LDOUBLEV 已提交
90 91 92 93 94 95
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
L
LDOUVLEV 已提交
96
  else if (resize_w / 32 < 1 + 1e-5)
L
LDOUBLEV 已提交
97 98
    resize_w = 32;
  else
L
LDOUVLEV 已提交
99
    resize_w = (resize_w / 32 - 1) * 32;
L
LDOUBLEV 已提交
100 101 102 103

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

L
LDOUBLEV 已提交
104 105
  ratio_hw.push_back(static_cast<float>(resize_h) / static_cast<float>(h));
  ratio_hw.push_back(static_cast<float>(resize_w) / static_cast<float>(w));
L
LDOUBLEV 已提交
106 107 108
  return resize_img;
}

W
WenmuZhou 已提交
109 110 111 112 113 114 115 116
// Runs the text-direction classifier on a text crop and rotates the crop by
// 180 degrees when the classifier reports an odd label with a score above the
// threshold.
//
// img:           text crop to classify.
// predictor_cls: classifier predictor. Input 0 receives the normalized image;
//                output 0 is read as float scores and output 1 as the label.
// thresh:        score that must be exceeded for the rotation to be applied.
//
// Returns a copy of img, rotated by 180 degrees if the check above passes.
cv::Mat RunClsModel(cv::Mat img, std::shared_ptr<PaddlePredictor> predictor_cls,
                    const float thresh = 0.5) {
  const std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  const std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  // srcimg is the image handed back to the caller; crop_img is a separate
  // copy used as the classifier input.
  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  img.copyTo(crop_img);

  cv::Mat resize_img = ClsResizeImg(crop_img);
  resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);
  const float *dimg = reinterpret_cast<const float *>(resize_img.data);

  // Fill input 0 with the normalized image in planar layout.
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor_cls->GetInput(0)));
  input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
  auto *data0 = input_tensor0->mutable_data<float>();
  NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);

  // Run CLS predictor
  predictor_cls->Run();

  // Get output and run postprocess. The outputs are only read, so the
  // read-only accessor is used for both tensors.
  std::unique_ptr<const Tensor> softmax_out(
      std::move(predictor_cls->GetOutput(0)));
  std::unique_ptr<const Tensor> label_out(
      std::move(predictor_cls->GetOutput(1)));
  const auto *softmax_scores = softmax_out->data<float>();
  const auto *label_idxs = label_out->data<int64>();
  const int label_idx = static_cast<int>(label_idxs[0]);
  const float score = softmax_scores[label_idx];

  // NOTE(review): odd labels presumably mean "upside down" - confirm against
  // the classifier's label list.
  if (label_idx % 2 == 1 && score > thresh) {
    cv::rotate(srcimg, srcimg, cv::ROTATE_180);
  }
  return srcimg;
}

153
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
L
LDOUVLEV 已提交
154
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
L
update  
LDOUBLEV 已提交
155 156
                 std::vector<std::string> &rec_text,
                 std::vector<float> &rec_text_score,
W
WenmuZhou 已提交
157
                 std::vector<std::string> charactor_dict,
L
LDOUBLEV 已提交
158 159
                 std::shared_ptr<PaddlePredictor> predictor_cls,
                 int use_direction_classify) {
L
LDOUBLEV 已提交
160 161 162 163 164 165 166 167 168
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

  int index = 0;
L
LDOUVLEV 已提交
169 170
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
L
LDOUBLEV 已提交
171 172 173
    if (use_direction_classify >= 1) {
      crop_img = RunClsModel(crop_img, predictor_cls);
    }
L
LDOUBLEV 已提交
174 175
    float wh_ratio =
        static_cast<float>(crop_img.cols) / static_cast<float>(crop_img.rows);
L
LDOUBLEV 已提交
176

177
    resize_img = CrnnResizeImg(crop_img, wh_ratio);
L
LDOUBLEV 已提交
178 179
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

180
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);
L
LDOUBLEV 已提交
181

L
LDOUVLEV 已提交
182 183
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
L
LDOUBLEV 已提交
184
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
185
    auto *data0 = input_tensor0->mutable_data<float>();
L
LDOUBLEV 已提交
186

L
LDOUBLEV 已提交
187
    NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
L
LDOUBLEV 已提交
188 189 190 191 192
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
L
LDOUVLEV 已提交
193
        std::move(predictor_crnn->GetOutput(0)));
L
LDOUBLEV 已提交
194
    auto *rec_idx = output_tensor0->data<int64>();
L
LDOUBLEV 已提交
195 196 197 198 199

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
L
LDOUBLEV 已提交
200 201 202
    for (int n = static_cast<int>(rec_idx_lod[0][0]);
         n < static_cast<int>(rec_idx_lod[0][1]); n += 1) {
      pred_idx.push_back(static_cast<int>(rec_idx[n]));
L
LDOUBLEV 已提交
203 204
    }

205 206
    if (pred_idx.size() < 1e-3)
      continue;
L
LDOUBLEV 已提交
207 208

    index += 1;
L
LDOUVLEV 已提交
209
    std::string pred_txt = "";
L
LDOUBLEV 已提交
210
    for (int n = 0; n < pred_idx.size(); n++) {
L
LDOUVLEV 已提交
211
      pred_txt += charactor_dict[pred_idx[n]];
L
LDOUBLEV 已提交
212
    }
L
LDOUVLEV 已提交
213
    rec_text.push_back(pred_txt);
L
LDOUBLEV 已提交
214 215

    ////get score
L
LDOUVLEV 已提交
216 217
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
218
    auto *predict_batch = output_tensor1->data<float>();
L
LDOUBLEV 已提交
219 220 221 222 223 224 225 226 227
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
L
LDOUBLEV 已提交
228 229 230 231
      int argmax_idx =
          static_cast<int>(Argmax(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
      float max_value =
L
LDOUVLEV 已提交
232 233
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
L
LDOUBLEV 已提交
234 235 236 237 238 239 240

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
L
LDOUVLEV 已提交
241
    rec_text_score.push_back(score);
L
LDOUBLEV 已提交
242 243 244
  }
}

245 246 247
std::vector<std::vector<std::vector<int>>>
RunDetModel(std::shared_ptr<PaddlePredictor> predictor, cv::Mat img,
            std::map<std::string, double> Config) {
L
LDOUBLEV 已提交
248
  // Read img
L
LDOUVLEV 已提交
249
  int max_side_len = int(Config["max_side_len"]);
L
LDOUBLEV 已提交
250 251 252 253

  cv::Mat srcimg;
  img.copyTo(srcimg);

L
LDOUVLEV 已提交
254 255
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
L
LDOUBLEV 已提交
256 257 258 259 260 261
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
262
  auto *data0 = input_tensor0->mutable_data<float>();
L
LDOUBLEV 已提交
263 264

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
L
LDOUVLEV 已提交
265
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
266
  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
L
LDOUBLEV 已提交
267
  NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);
L
LDOUBLEV 已提交
268 269 270 271 272

  // Run predictor
  predictor->Run();

  // Get output and post process
L
LDOUVLEV 已提交
273 274
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
275
  auto *outptr = output_tensor->data<float>();
L
LDOUBLEV 已提交
276 277 278
  auto shape_out = output_tensor->shape();

  // Save output
L
update  
LDOUBLEV 已提交
279 280
  float pred[shape_out[2] * shape_out[3]];
  unsigned char cbuf[shape_out[2] * shape_out[3]];
L
LDOUBLEV 已提交
281

L
LDOUVLEV 已提交
282
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
L
LDOUBLEV 已提交
283 284
    pred[i] = static_cast<float>(outptr[i]);
    cbuf[i] = static_cast<unsigned char>((outptr[i]) * 255);
L
LDOUBLEV 已提交
285 286
  }

L
LDOUBLEV 已提交
287
  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1,
L
LDOUBLEV 已提交
288
                   reinterpret_cast<unsigned char *>(cbuf));
L
LDOUBLEV 已提交
289
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F,
L
LDOUBLEV 已提交
290
                   reinterpret_cast<float *>(pred));
L
LDOUBLEV 已提交
291

L
LDOUVLEV 已提交
292
  const double threshold = double(Config["det_db_thresh"]) * 255;
L
LDOUBLEV 已提交
293 294 295
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);
L
LDOUBLEV 已提交
296
  cv::Mat dilation_map;
L
LDOUBLEV 已提交
297
  cv::Mat dila_ele = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
L
LDOUBLEV 已提交
298 299
  cv::dilate(bit_map, dilation_map, dila_ele);
  auto boxes = BoxesFromBitmap(pred_map, dilation_map, Config);
L
LDOUBLEV 已提交
300

L
LDOUVLEV 已提交
301 302
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
L
LDOUBLEV 已提交
303

L
LDOUVLEV 已提交
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green outline on a copy of the image and writes
// the result to ./vis.jpg.
//
// srcimg: image to draw on; the drawing happens on a copy.
// boxes:  boxes to draw; each box is a list of points and each point holds
//         its x coordinate at index 0 and its y coordinate at index 1.
//
// Returns the image with the outlines drawn.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  // One reusable point buffer sized by the box being drawn. This replaces a
  // variable-length array with a fixed 4 slots per box, which had zero length
  // without boxes and overflowed for boxes with more than 4 points.
  std::vector<cv::Point> outline;
  for (const auto &box : boxes) {
    outline.clear();
    for (const auto &pt : box) {
      if (pt.size() < 2) {
        continue;  // not a usable (x, y) pair
      }
      outline.push_back(cv::Point(pt[0], pt[1]));
    }
    if (outline.empty()) {
      continue;
    }

    const cv::Point *ppt[1] = {outline.data()};
    const int npt[] = {static_cast<int>(outline.size())};
    cv::polylines(img_vis, ppt, npt, 1, true, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  // Report success only when the file was actually written.
  if (cv::imwrite("./vis.jpg", img_vis)) {
    std::cout << "The detection visualized image saved in ./vis.jpg"
              << std::endl;
  } else {
    std::cerr << "[ERROR] failed to write ./vis.jpg\n";
  }
  return img_vis;
}

338 339
// Splits str into tokens.
//
// Every character of delim is a separator on its own. Runs of separators are
// treated as a single separator and separators at either end are ignored, so
// no empty token is ever produced.
//
// str:   text to split.
// delim: set of separator characters; when empty, a non-empty str is returned
//        as one token.
//
// Returns the tokens in order of appearance; empty when str is empty or
// consists of separators only.
std::vector<std::string> split(const std::string &str,
                               const std::string &delim) {
  std::vector<std::string> res;

  // Works on the string directly: no temporary C buffers to release and no
  // hidden global tokenizer state.
  std::string::size_type token_begin = str.find_first_not_of(delim);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        str.find_first_of(delim, token_begin);
    // When no further separator exists token_end is npos and substr takes the
    // rest of the string.
    res.push_back(str.substr(token_begin, token_end - token_begin));
    token_begin = str.find_first_not_of(delim, token_end);
  }

  return res;
}

L
LDOUVLEV 已提交
359 360 361 362 363 364 365 366 367 368
// Loads "key value" settings from a text file.
//
// Each line returned by ReadDict is split on spaces; the first field becomes
// the key and the second field is parsed as a double. Lines with fewer than
// two fields are skipped. When a key occurs more than once the last
// occurrence wins.
//
// config_path: path handed to ReadDict.
//
// Returns the parsed settings. std::stod throws when a value is not a number.
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  const auto lines = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (const auto &line : lines) {
    const std::vector<std::string> fields = split(line, " ");
    if (fields.size() < 2) {
      // Blank or incomplete line: indexing the fields would be out of
      // bounds, so skip it and report lines that carry a lone key.
      if (!fields.empty()) {
        std::cerr << "[WARN] ignoring config line without a value: " << line
                  << "\n";
      }
      continue;
    }
    dict[fields[0]] = std::stod(fields[1]);
  }
  return dict;
}
L
LDOUBLEV 已提交
369

370
int main(int argc, char **argv) {
L
LDOUVLEV 已提交
371 372
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
L
LDOUBLEV 已提交
373 374
              << " det_model_file cls_model_file rec_model_file image_path "
                 "charactor_dict\n";
L
LDOUBLEV 已提交
375 376 377 378
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
W
WenmuZhou 已提交
379 380 381
  std::string cls_model_file = argv[3];
  std::string img_path = argv[4];
  std::string dict_path = argv[5];
L
LDOUVLEV 已提交
382 383 384

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
L
LDOUBLEV 已提交
385
  int use_direction_classify = int(Config["use_direction_classify"]);
L
LDOUBLEV 已提交
386 387 388

  auto start = std::chrono::system_clock::now();

L
LDOUVLEV 已提交
389 390
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);
W
WenmuZhou 已提交
391
  auto cls_predictor = loadModel(cls_model_file);
L
LDOUVLEV 已提交
392

L
update  
LDOUBLEV 已提交
393
  auto charactor_dict = ReadDict(dict_path);
L
LDOUBLEV 已提交
394
  charactor_dict.push_back(" ");
L
update  
LDOUBLEV 已提交
395

L
LDOUBLEV 已提交
396
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
L
LDOUVLEV 已提交
397
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
L
LDOUBLEV 已提交
398

L
LDOUVLEV 已提交
399 400
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
L
LDOUBLEV 已提交
401

L
update  
LDOUBLEV 已提交
402
  RunRecModel(boxes, srcimg, rec_predictor, rec_text, rec_text_score,
L
LDOUBLEV 已提交
403
              charactor_dict, cls_predictor, use_direction_classify);
L
LDOUBLEV 已提交
404

L
LDOUVLEV 已提交
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
L
LDOUBLEV 已提交
422 423 424 425
            << "秒" << std::endl;

  return 0;
}