ocr_db_crnn.cc 11.6 KB
Newer Older
L
LDOUVLEV 已提交
1
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
L
LDOUBLEV 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15
#include "paddle_api.h" // NOLINT
L
LDOUBLEV 已提交
16 17
#include <chrono>

L
LDOUVLEV 已提交
18 19
#include "crnn_process.h"
#include "db_post_process.h"
L
LDOUBLEV 已提交
20

21
using namespace paddle::lite_api; // NOLINT
L
LDOUVLEV 已提交
22
using namespace std;
L
LDOUBLEV 已提交
23 24

// Normalizes an interleaved 3-channel float image and converts its layout
// from NHWC (c0 c1 c2 c0 c1 c2 ...) to NCHW (three consecutive planes).
//
// For every pixel p in [0, size) and channel c in {0, 1, 2}:
//   dout[c * size + p] = (din[p * 3 + c] - mean[c]) * scale[c]
//
// din   - interleaved input; size * 3 floats are read.
// dout  - planar output; size * 3 floats are written.
// size  - number of pixels per channel plane.
// mean  - per-channel value subtracted from the input (exactly 3 entries).
// scale - per-channel multiplier applied after the subtraction (exactly 3
//         entries).
//
// Prints an error and terminates the process when mean or scale does not
// contain exactly 3 entries. Does nothing when size is not positive.
void NeonMeanScale(const float *din, float *dout, int size,
                   const std::vector<float> &mean,
                   const std::vector<float> &scale) {
  if (mean.size() != 3 || scale.size() != 3) {
    std::cerr << "[ERROR] mean or scale size must equal to 3\n";
    exit(1);
  }
  if (size <= 0) {
    return;
  }

  // One output plane per channel.
  float *dout_c0 = dout;
  float *dout_c1 = dout + size;
  float *dout_c2 = dout + size * 2;

  int i = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  // Vector path: 4 pixels (12 interleaved floats) per iteration.
  const float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  const float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  const float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  const float32x4_t vscale0 = vdupq_n_f32(scale[0]);
  const float32x4_t vscale1 = vdupq_n_f32(scale[1]);
  const float32x4_t vscale2 = vdupq_n_f32(scale[2]);

  for (; i < size - 3; i += 4) {
    // vld3q_f32 de-interleaves: val[0..2] each hold 4 values of one channel.
    const float32x4x3_t vin3 = vld3q_f32(din);
    const float32x4_t vs0 = vmulq_f32(vsubq_f32(vin3.val[0], vmean0), vscale0);
    const float32x4_t vs1 = vmulq_f32(vsubq_f32(vin3.val[1], vmean1), vscale1);
    const float32x4_t vs2 = vmulq_f32(vsubq_f32(vin3.val[2], vmean2), vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
#endif
  // Scalar path: handles the tail left by the vector loop, or every pixel when
  // NEON is not available.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) * scale[2];
  }
}

// resize image to a size multiple of 32 which is required by the network
69 70
cv::Mat DetResizeImg(const cv::Mat img, int max_size_len,
                     std::vector<float> &ratio_hw) {
L
LDOUBLEV 已提交
71 72 73 74
  int w = img.cols;
  int h = img.rows;

  float ratio = 1.f;
L
LDOUVLEV 已提交
75 76 77
  int max_wh = w >= h ? w : h;
  if (max_wh > max_size_len) {
    if (h > w) {
L
LDOUBLEV 已提交
78
      ratio = static_cast<float>(max_size_len) / static_cast<float>(h);
L
LDOUBLEV 已提交
79
    } else {
L
LDOUBLEV 已提交
80
      ratio = static_cast<float>(max_size_len) / static_cast<float>(w);
L
LDOUBLEV 已提交
81 82 83
    }
  }

L
LDOUBLEV 已提交
84 85
  int resize_h = static_cast<int>(float(h) * ratio);
  int resize_w = static_cast<int>(float(w) * ratio);
L
LDOUBLEV 已提交
86 87
  if (resize_h % 32 == 0)
    resize_h = resize_h;
L
LDOUVLEV 已提交
88
  else if (resize_h / 32 < 1 + 1e-5)
L
LDOUBLEV 已提交
89 90 91 92 93 94
    resize_h = 32;
  else
    resize_h = (resize_h / 32 - 1) * 32;

  if (resize_w % 32 == 0)
    resize_w = resize_w;
L
LDOUVLEV 已提交
95
  else if (resize_w / 32 < 1 + 1e-5)
L
LDOUBLEV 已提交
96 97
    resize_w = 32;
  else
L
LDOUVLEV 已提交
98
    resize_w = (resize_w / 32 - 1) * 32;
L
LDOUBLEV 已提交
99 100 101 102

  cv::Mat resize_img;
  cv::resize(img, resize_img, cv::Size(resize_w, resize_h));

L
LDOUBLEV 已提交
103 104
  ratio_hw.push_back(static_cast<float>(resize_h) / static_cast<float>(h));
  ratio_hw.push_back(static_cast<float>(resize_w) / static_cast<float>(w));
L
LDOUBLEV 已提交
105 106 107
  return resize_img;
}

108
void RunRecModel(std::vector<std::vector<std::vector<int>>> boxes, cv::Mat img,
L
LDOUVLEV 已提交
109
                 std::shared_ptr<PaddlePredictor> predictor_crnn,
L
update  
LDOUBLEV 已提交
110 111 112
                 std::vector<std::string> &rec_text,
                 std::vector<float> &rec_text_score,
                 std::vector<std::string> charactor_dict) {
L
LDOUBLEV 已提交
113 114 115 116 117 118 119 120 121
  std::vector<float> mean = {0.5f, 0.5f, 0.5f};
  std::vector<float> scale = {1 / 0.5f, 1 / 0.5f, 1 / 0.5f};

  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;

  int index = 0;
L
LDOUVLEV 已提交
122 123
  for (int i = boxes.size() - 1; i >= 0; i--) {
    crop_img = GetRotateCropImage(srcimg, boxes[i]);
L
LDOUBLEV 已提交
124 125
    float wh_ratio =
        static_cast<float>(crop_img.cols) / static_cast<float>(crop_img.rows);
L
LDOUBLEV 已提交
126

127
    resize_img = CrnnResizeImg(crop_img, wh_ratio);
L
LDOUBLEV 已提交
128 129
    resize_img.convertTo(resize_img, CV_32FC3, 1 / 255.f);

130
    const float *dimg = reinterpret_cast<const float *>(resize_img.data);
L
LDOUBLEV 已提交
131

L
LDOUVLEV 已提交
132 133
    std::unique_ptr<Tensor> input_tensor0(
        std::move(predictor_crnn->GetInput(0)));
L
LDOUBLEV 已提交
134
    input_tensor0->Resize({1, 3, resize_img.rows, resize_img.cols});
135
    auto *data0 = input_tensor0->mutable_data<float>();
L
LDOUBLEV 已提交
136

L
LDOUBLEV 已提交
137
    NeonMeanScale(dimg, data0, resize_img.rows * resize_img.cols, mean, scale);
L
LDOUBLEV 已提交
138 139 140 141 142
    //// Run CRNN predictor
    predictor_crnn->Run();

    // Get output and run postprocess
    std::unique_ptr<const Tensor> output_tensor0(
L
LDOUVLEV 已提交
143
        std::move(predictor_crnn->GetOutput(0)));
L
LDOUBLEV 已提交
144
    auto *rec_idx = output_tensor0->data<int64>();
L
LDOUBLEV 已提交
145 146 147 148 149

    auto rec_idx_lod = output_tensor0->lod();
    auto shape_out = output_tensor0->shape();

    std::vector<int> pred_idx;
L
LDOUBLEV 已提交
150 151 152
    for (int n = static_cast<int>(rec_idx_lod[0][0]);
         n < static_cast<int>(rec_idx_lod[0][1]); n += 1) {
      pred_idx.push_back(static_cast<int>(rec_idx[n]));
L
LDOUBLEV 已提交
153 154
    }

155 156
    if (pred_idx.size() < 1e-3)
      continue;
L
LDOUBLEV 已提交
157 158

    index += 1;
L
LDOUVLEV 已提交
159
    std::string pred_txt = "";
L
LDOUBLEV 已提交
160
    for (int n = 0; n < pred_idx.size(); n++) {
L
LDOUVLEV 已提交
161
      pred_txt += charactor_dict[pred_idx[n]];
L
LDOUBLEV 已提交
162
    }
L
LDOUVLEV 已提交
163
    rec_text.push_back(pred_txt);
L
LDOUBLEV 已提交
164 165

    ////get score
L
LDOUVLEV 已提交
166 167
    std::unique_ptr<const Tensor> output_tensor1(
        std::move(predictor_crnn->GetOutput(1)));
168
    auto *predict_batch = output_tensor1->data<float>();
L
LDOUBLEV 已提交
169 170 171 172 173 174 175 176 177
    auto predict_shape = output_tensor1->shape();

    auto predict_lod = output_tensor1->lod();

    int blank = predict_shape[1];
    float score = 0.f;
    int count = 0;

    for (int n = predict_lod[0][0]; n < predict_lod[0][1] - 1; n++) {
L
LDOUBLEV 已提交
178 179 180 181
      int argmax_idx =
          static_cast<int>(Argmax(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
      float max_value =
L
LDOUVLEV 已提交
182 183
          float(*std::max_element(&predict_batch[n * predict_shape[1]],
                                  &predict_batch[(n + 1) * predict_shape[1]]));
L
LDOUBLEV 已提交
184 185 186 187 188 189 190

      if (blank - 1 - argmax_idx > 1e-5) {
        score += max_value;
        count += 1;
      }
    }
    score /= count;
L
LDOUVLEV 已提交
191
    rec_text_score.push_back(score);
L
LDOUBLEV 已提交
192 193 194
  }
}

195 196 197
std::vector<std::vector<std::vector<int>>>
RunDetModel(std::shared_ptr<PaddlePredictor> predictor, cv::Mat img,
            std::map<std::string, double> Config) {
L
LDOUBLEV 已提交
198
  // Read img
L
LDOUVLEV 已提交
199
  int max_side_len = int(Config["max_side_len"]);
L
LDOUBLEV 已提交
200 201 202 203

  cv::Mat srcimg;
  img.copyTo(srcimg);

L
LDOUVLEV 已提交
204 205
  std::vector<float> ratio_hw;
  img = DetResizeImg(img, max_side_len, ratio_hw);
L
LDOUBLEV 已提交
206 207 208 209 210 211
  cv::Mat img_fp;
  img.convertTo(img_fp, CV_32FC3, 1.0 / 255.f);

  // Prepare input data from image
  std::unique_ptr<Tensor> input_tensor0(std::move(predictor->GetInput(0)));
  input_tensor0->Resize({1, 3, img_fp.rows, img_fp.cols});
212
  auto *data0 = input_tensor0->mutable_data<float>();
L
LDOUBLEV 已提交
213 214

  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
L
LDOUVLEV 已提交
215
  std::vector<float> scale = {1 / 0.229f, 1 / 0.224f, 1 / 0.225f};
216
  const float *dimg = reinterpret_cast<const float *>(img_fp.data);
L
LDOUBLEV 已提交
217
  NeonMeanScale(dimg, data0, img_fp.rows * img_fp.cols, mean, scale);
L
LDOUBLEV 已提交
218 219 220 221 222

  // Run predictor
  predictor->Run();

  // Get output and post process
L
LDOUVLEV 已提交
223 224
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
225
  auto *outptr = output_tensor->data<float>();
L
LDOUBLEV 已提交
226 227 228
  auto shape_out = output_tensor->shape();

  // Save output
L
update  
LDOUBLEV 已提交
229 230
  float pred[shape_out[2] * shape_out[3]];
  unsigned char cbuf[shape_out[2] * shape_out[3]];
L
LDOUBLEV 已提交
231

L
LDOUVLEV 已提交
232
  for (int i = 0; i < int(shape_out[2] * shape_out[3]); i++) {
L
LDOUBLEV 已提交
233 234
    pred[i] = static_cast<float>(outptr[i]);
    cbuf[i] = static_cast<unsigned char>((outptr[i]) * 255);
L
LDOUBLEV 已提交
235 236
  }

L
LDOUBLEV 已提交
237
  cv::Mat cbuf_map(shape_out[2], shape_out[3], CV_8UC1,
L
LDOUBLEV 已提交
238
                   reinterpret_cast<unsigned char *>(cbuf));
L
LDOUBLEV 已提交
239
  cv::Mat pred_map(shape_out[2], shape_out[3], CV_32F,
L
LDOUBLEV 已提交
240
                   reinterpret_cast<float *>(pred));
L
LDOUBLEV 已提交
241

L
LDOUVLEV 已提交
242
  const double threshold = double(Config["det_db_thresh"]) * 255;
L
LDOUBLEV 已提交
243 244 245 246
  const double maxvalue = 255;
  cv::Mat bit_map;
  cv::threshold(cbuf_map, bit_map, threshold, maxvalue, cv::THRESH_BINARY);

L
LDOUVLEV 已提交
247
  auto boxes = BoxesFromBitmap(pred_map, bit_map, Config);
L
LDOUBLEV 已提交
248

L
LDOUVLEV 已提交
249 250
  std::vector<std::vector<std::vector<int>>> filter_boxes =
      FilterTagDetRes(boxes, ratio_hw[0], ratio_hw[1], srcimg);
L
LDOUBLEV 已提交
251

L
LDOUVLEV 已提交
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
  return filter_boxes;
}

std::shared_ptr<PaddlePredictor> loadModel(std::string model_file) {
  MobileConfig config;
  config.set_model_from_file(model_file);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);
  return predictor;
}

// Draws every box as a closed green polygon (thickness 2) on a copy of
// `srcimg`, writes the result to ./vis.jpg and returns it.
//
// srcimg - image to draw on; it is copied, not modified.
// boxes  - boxes[n][m] is the m-th vertex of box n, stored as {x, y}.
//
// Returns the annotated image.
cv::Mat Visualization(cv::Mat srcimg,
                      std::vector<std::vector<std::vector<int>>> boxes) {
  cv::Mat img_vis;
  srcimg.copyTo(img_vis);

  // One reusable vertex buffer sized per box. This replaces a fixed
  // [boxes.size()][4] stack array, which overflowed for boxes with more than
  // 4 vertices and is not standard C++.
  std::vector<cv::Point> polygon;
  for (const auto &box : boxes) {
    polygon.clear();
    for (const auto &vertex : box) {
      if (vertex.size() < 2)
        continue; // malformed vertex: needs both x and y
      polygon.emplace_back(static_cast<int>(vertex[0]),
                           static_cast<int>(vertex[1]));
    }
    if (polygon.empty())
      continue;

    const cv::Point *ppt[1] = {polygon.data()};
    int npt[] = {static_cast<int>(polygon.size())};
    cv::polylines(img_vis, ppt, npt, 1, 1, CV_RGB(0, 255, 0), 2, 8, 0);
  }

  cv::imwrite("./vis.jpg", img_vis);
  std::cout << "The detection visualized image saved in ./vis.jpg" << std::endl;
  return img_vis;
}

286 287
// Splits `str` into tokens separated by any character contained in `delim`.
//
// Every character of `delim` is an individual separator. Runs of separators
// are treated as one, and leading/trailing separators are ignored, so no
// empty token is ever produced.
//
// str   - text to split.
// delim - set of separator characters; when empty, the whole of `str` is
//         returned as a single token.
//
// Returns the tokens in order of appearance; empty when `str` is empty or
// consists solely of separators.
std::vector<std::string> split(const std::string &str,
                               const std::string &delim) {
  std::vector<std::string> res;

  std::string::size_type token_begin = str.find_first_not_of(delim);
  while (token_begin != std::string::npos) {
    const std::string::size_type token_end =
        str.find_first_of(delim, token_begin);
    // substr clamps the length, so token_end == npos takes the rest of str.
    res.push_back(str.substr(token_begin, token_end - token_begin));
    // Yields npos (ending the loop) once token_end is npos.
    token_begin = str.find_first_not_of(delim, token_end);
  }

  return res;
}

L
LDOUVLEV 已提交
307 308 309 310 311 312 313 314 315 316
// Loads "key value" settings from the text file at `config_path`.
//
// The lines come from ReadDict. Each line is split on spaces; the first token
// is the key and the second is parsed as a double with std::stod (which
// throws if the token is not a number). Lines without any token are ignored,
// and lines with a key but no value are reported and skipped.
//
// Returns the parsed key -> value map; a later line overrides an earlier one
// with the same key.
std::map<std::string, double> LoadConfigTxt(std::string config_path) {
  auto config = ReadDict(config_path);

  std::map<std::string, double> dict;
  for (const auto &line : config) {
    std::vector<std::string> res = split(line, " ");
    if (res.empty()) {
      continue; // blank line
    }
    if (res.size() < 2) {
      std::cerr << "[WARN] config key without value skipped: " << res[0]
                << "\n";
      continue;
    }
    dict[res[0]] = std::stod(res[1]);
  }
  return dict;
}
L
LDOUBLEV 已提交
317

318
int main(int argc, char **argv) {
L
LDOUVLEV 已提交
319 320 321
  if (argc < 5) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " det_model_file rec_model_file image_path\n";
L
LDOUBLEV 已提交
322 323 324 325 326
    exit(1);
  }
  std::string det_model_file = argv[1];
  std::string rec_model_file = argv[2];
  std::string img_path = argv[3];
L
LDOUVLEV 已提交
327 328 329 330
  std::string dict_path = argv[4];

  //// load config from txt file
  auto Config = LoadConfigTxt("./config.txt");
L
LDOUBLEV 已提交
331 332 333

  auto start = std::chrono::system_clock::now();

L
LDOUVLEV 已提交
334 335 336
  auto det_predictor = loadModel(det_model_file);
  auto rec_predictor = loadModel(rec_model_file);

L
update  
LDOUBLEV 已提交
337
  auto charactor_dict = ReadDict(dict_path);
L
LDOUBLEV 已提交
338
  charactor_dict.push_back(" ");
L
update  
LDOUBLEV 已提交
339

L
LDOUBLEV 已提交
340
  cv::Mat srcimg = cv::imread(img_path, cv::IMREAD_COLOR);
L
LDOUVLEV 已提交
341
  auto boxes = RunDetModel(det_predictor, srcimg, Config);
L
LDOUBLEV 已提交
342

L
LDOUVLEV 已提交
343 344
  std::vector<std::string> rec_text;
  std::vector<float> rec_text_score;
L
update  
LDOUBLEV 已提交
345 346
  RunRecModel(boxes, srcimg, rec_predictor, rec_text, rec_text_score,
              charactor_dict);
L
LDOUBLEV 已提交
347

L
LDOUVLEV 已提交
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
  auto end = std::chrono::system_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(end - start);

  //// visualization
  auto img_vis = Visualization(srcimg, boxes);

  //// print recognized text
  for (int i = 0; i < rec_text.size(); i++) {
    std::cout << i << "\t" << rec_text[i] << "\t" << rec_text_score[i]
              << std::endl;
  }

  std::cout << "花费了"
            << double(duration.count()) *
                   std::chrono::microseconds::period::num /
                   std::chrono::microseconds::period::den
L
LDOUBLEV 已提交
365 366 367 368
            << "秒" << std::endl;

  return 0;
}