// ocr_rec.cpp

// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <include/ocr_rec.h>

// Returns the indices of `array` ordered so that the referenced elements are
// ascending: array[result[0]] <= array[result[1]] <= ...
//
// Elements that compare equal keep their original relative order, so the
// result is deterministic for a given input. An empty input yields an empty
// result. T only needs to support operator<.
template <typename T>
std::vector<int> argsort(const std::vector<T>& array)
{
    const int array_len = static_cast<int>(array.size());
    std::vector<int> array_index(array_len);
    for (int i = 0; i < array_len; ++i)
        array_index[i] = i;

    // stable_sort (rather than sort) keeps ties in input order.
    std::stable_sort(array_index.begin(), array_index.end(),
        [&array](int lhs, int rhs) { return array[lhs] < array[rhs]; });

    return array_index;
}

namespace PaddleOCR {

void CRNNRecognizer::Run(std::vector<std::vector<std::vector<int>>> boxes,
                         cv::Mat &img, Classifier *cls) {
  cv::Mat srcimg;
  img.copyTo(srcimg);
  cv::Mat crop_img;
  cv::Mat resize_img;
  std::vector<float> width_list;
  std::vector<cv::Mat> img_list;

  for (int i = boxes.size() - 1; i >= 0; i--) {
      crop_img = GetRotateCropImage(srcimg, boxes[i]);
      if (cls != nullptr) {
          crop_img = cls->Run(crop_img);
      }
      img_list.push_back(crop_img);
      float wh_ratio = float(crop_img.cols) / float(crop_img.rows);
      width_list.push_back(wh_ratio);
  }
  //sort box
  vector<int> sort_index = argsort(width_list);
  int batch_num1 = this->rec_batch_num_;//batchsize
  std::cout << "The predicted text is :" << std::endl;
  int index = 0;
  int beg_img_no = 0;
  int end_img_no = 0;
  for (int beg_img_no = 0; beg_img_no < img_list.size(); beg_img_no += batch_num1)
  {
    float max_wh_ratio = 0;
    end_img_no = min((int)boxes.size(), beg_img_no + batch_num1);
    int batch_num = min(end_img_no - beg_img_no, batch_num1);
    max_wh_ratio = width_list[sort_index[end_img_no - 1]];
    int imgW1 = int(32 * max_wh_ratio);
    int nqu, nra;
    nqu = imgW1 / 4;
    nra = imgW1 % 4;
    int imgW = imgW1;
    if (nra > 0)
    {
        imgW = int(4 * (nqu + 1));
    }
    std::vector<float> input(batch_num * 3 * 32 * imgW, 0.0f);//batchsize input
    for (int i = beg_img_no; i < end_img_no; i++)
    {
        crop_img = img_list[sort_index[i]];
        this->resize_op_.Run(crop_img, resize_img, max_wh_ratio);//resize
        this->normalize_op_.Run(&resize_img, this->mean_, this->scale_,
            this->is_scale_);

        cv::Mat padding_im;
        cv::copyMakeBorder(resize_img, padding_im, 0, 0, 0, int(imgW - resize_img.cols), cv::BORDER_CONSTANT, { 0, 0, 0 });//padding image

        this->permute_op_.Run(&padding_im, input.data() + (i - beg_img_no) * 3 * padding_im.rows * padding_im.cols);
    }
    auto input_names = this->predictor_->GetInputNames();
    auto input_t = this->predictor_->GetInputTensor(input_names[0]);
    input_t->Reshape({ batch_num, 3, 32, imgW });
    input_t->copy_from_cpu(input.data());

    this->predictor_->ZeroCopyRun();

    std::vector<int64_t> rec_idx;
    auto output_names = this->predictor_->GetOutputNames();
    auto output_t = this->predictor_->GetOutputTensor(output_names[0]);
    auto rec_idx_lod = output_t->lod()[0];

    std::vector<int> output_shape = output_t->shape();
    int out_num = 1;
    for (int i = 0; i < output_shape.size(); ++i) {
        out_num *= output_shape[i];
    }
    rec_idx.resize(out_num);
    output_t->copy_to_cpu(rec_idx.data());//output data

    std::vector<float> predict_batch;
    auto output_t_1 = this->predictor_->GetOutputTensor(output_names[1]);

    auto predict_lod = output_t_1->lod()[0];
    auto predict_shape = output_t_1->shape();

    int out_num_1 = 1;
    for (int i = 0; i < predict_shape.size(); ++i) {
        out_num_1 *= predict_shape[i];
    }

    predict_batch.resize(out_num_1);
    output_t_1->copy_to_cpu(predict_batch.data());

    int argmax_idx;
    int blank = predict_shape[1];

    for (int j = 0; j < rec_idx_lod.size() - 1; j++)
    {
        std::vector<int> pred_idx;
        float score = 0.f;
        int count = 0;
        float max_value = 0.0f;
        for (int n = int(rec_idx_lod[j]); n < int(rec_idx_lod[j + 1]); n++) {
            pred_idx.push_back(int(rec_idx[n]));
        }
        if (pred_idx.size() < 1e-3)
            continue;

        index += 1;
        std::cout << index << "\t";
        for (int n = 0; n < pred_idx.size(); n++) {
            std::cout << label_list_[pred_idx[n]];
        }

        for (int n = predict_lod[j]; n < predict_lod[j + 1] - 1; n++) {
            argmax_idx =
                int(Utility::argmax(&predict_batch[n * predict_shape[1]],
                    &predict_batch[(n + 1) * predict_shape[1]]));

            max_value = predict_batch[n * predict_shape[1] + argmax_idx];
            if (blank - 1 - argmax_idx > 1e-5) {
                score += max_value;
                count += 1;
            }
        }
        score /= count;
        std::cout << "\tscore: " << score << std::endl;
    }
  }
}

void CRNNRecognizer::LoadModel(const std::string &model_dir) {
  AnalysisConfig config;
  config.SetModel(model_dir + "/model", model_dir + "/params");

  if (this->use_gpu_) {
    config.EnableUseGpu(this->gpu_mem_, this->gpu_id_);
  } else {
    config.DisableGpu();
    if (this->use_mkldnn_) {
      config.EnableMKLDNN();
      // cache 10 different shapes for mkldnn to avoid memory leak
      config.SetMkldnnCacheCapacity(10);
    }
    config.SetCpuMathLibraryNumThreads(this->cpu_math_library_num_threads_);
  }

  // false for zero copy tensor
  // true for commom tensor
  config.SwitchUseFeedFetchOps(!this->use_zero_copy_run_);
  // true for multiple input
  config.SwitchSpecifyInputNames(true);

  config.SwitchIrOptim(true);

  config.EnableMemoryOptim();
  config.DisableGlogInfo();

  this->predictor_ = CreatePaddlePredictor(config);
}

// Cuts the quadrilateral `box` out of `srcimage` and warps it to an upright
// rectangle.
//
// srcimage: image to crop from; it is only read.
// box:      corner points in `srcimage` pixel coordinates. Must contain at
//           least four points with two coordinates {x, y} each; only the
//           first four are used. Edge 0->1 defines the output width and edge
//           0->3 the output height.
//
// Returns the rectified crop. If its height is at least 1.5 times its width,
// the crop is transposed and flipped vertically (i.e. rotated by 90 degrees
// counter-clockwise) before being returned.
cv::Mat CRNNRecognizer::GetRotateCropImage(const cv::Mat &srcimage,
                                           std::vector<std::vector<int>> box) {
  const int x_collect[4] = {box[0][0], box[1][0], box[2][0], box[3][0]};
  const int y_collect[4] = {box[0][1], box[1][1], box[2][1], box[3][1]};
  const int left = *std::min_element(x_collect, x_collect + 4);
  const int right = *std::max_element(x_collect, x_collect + 4);
  const int top = *std::min_element(y_collect, y_collect + 4);
  const int bottom = *std::max_element(y_collect, y_collect + 4);

  // Clip the bounding rectangle to the image so that a box reaching past the
  // border does not make the ROI extraction throw. For boxes that lie inside
  // the image this leaves the rectangle unchanged.
  const cv::Rect roi = cv::Rect(left, top, right - left, bottom - top) &
                       cv::Rect(0, 0, srcimage.cols, srcimage.rows);

  // Copy only the region of interest; the full image is never duplicated.
  cv::Mat img_crop;
  srcimage(roi).copyTo(img_crop);

  // Corner points expressed relative to the crop's origin.
  cv::Point2f pointsf[4];
  for (int i = 0; i < 4; i++) {
    pointsf[i] = cv::Point2f(static_cast<float>(box[i][0] - roi.x),
                             static_cast<float>(box[i][1] - roi.y));
  }

  // Output size: Euclidean lengths of edges 0->1 and 0->3, truncated to int.
  const double width_dx = box[0][0] - box[1][0];
  const double width_dy = box[0][1] - box[1][1];
  const double height_dx = box[0][0] - box[3][0];
  const double height_dy = box[0][1] - box[3][1];
  const int img_crop_width =
      int(std::sqrt(width_dx * width_dx + width_dy * width_dy));
  const int img_crop_height =
      int(std::sqrt(height_dx * height_dx + height_dy * height_dy));

  const float out_w = static_cast<float>(img_crop_width);
  const float out_h = static_cast<float>(img_crop_height);
  const cv::Point2f pts_std[4] = {
      cv::Point2f(0.f, 0.f), cv::Point2f(out_w, 0.f),
      cv::Point2f(out_w, out_h), cv::Point2f(0.f, out_h)};

  const cv::Mat M = cv::getPerspectiveTransform(pointsf, pts_std);

  // The fifth argument of warpPerspective is the interpolation flag and the
  // sixth the border mode. BORDER_REPLICATE used to be passed in the fifth
  // slot, where its value happens to equal INTER_LINEAR, so the border mode
  // silently stayed BORDER_CONSTANT. Both are now given explicitly.
  cv::Mat dst_img;
  cv::warpPerspective(img_crop, dst_img, M,
                      cv::Size(img_crop_width, img_crop_height),
                      cv::INTER_LINEAR, cv::BORDER_REPLICATE);

  // Tall crops are turned by 90 degrees before being returned.
  if (float(dst_img.rows) >= float(dst_img.cols) * 1.5) {
    cv::Mat rotated;
    cv::transpose(dst_img, rotated);
    cv::flip(rotated, rotated, 0);
    return rotated;
  }
  return dst_img;
}

} // namespace PaddleOCR