Text recognition crnn (#176)

* rcnn only for EN * review 1 * add readme.md * rename file * review 2 * review 3 * option v * review 4

Text recognition crnn (#176)
* rcnn only for EN * review 1 * add readme.md * rename file * review 2 * review 3 * option v * review 4
9aa1562f · Laurent Berger · GitHub · 0d619617 · 9aa1562f · 9aa1562f
4 changed file
--- a/models/text_recognition_crnn/CMakeLists.txt
+++ b/models/text_recognition_crnn/CMakeLists.txt
+cmake_minimum_required(VERSION 3.24)
+set(project_name "opencv_zoo_text_recognition_crnn")
+
+PROJECT (${project_name})
+
+set(OPENCV_VERSION "4.7.0")
+set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
+find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
+# Find OpenCV, you may need to set OpenCV_DIR variable
+# to the absolute path to the directory containing OpenCVConfig.cmake file
+# via the command line or GUI
+
+file(GLOB SourceFile
+    "demo.cpp")
+# If the package has been found, several variables will
+# be set, you can find the full list with descriptions
+# in the OpenCVConfig.cmake file.
+# Print some message showing some of them
+message(STATUS "OpenCV library status:")
+message(STATUS "    config: ${OpenCV_DIR}")
+message(STATUS "    version: ${OpenCV_VERSION}")
+message(STATUS "    libraries: ${OpenCV_LIBS}")
+message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+
+# Declare the executable target built from your sources
+add_executable(${project_name} ${SourceFile})
+
+# Link your application with OpenCV libraries
+target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
--- a/models/text_recognition_crnn/README.md
+++ b/models/text_recognition_crnn/README.md
@@ -36,6 +36,8 @@ Note:
  - Try `text_recognition_CRNN_CH_2021sep.onnx` with `charset_94_CH.txt`
  - Try `text_recognition_CRNN_CN_2021sep.onnx` with `charset_3944_CN.txt`.

+### Python
+
 Run the demo detecting English:

 ```shell
@@ -52,13 +54,38 @@ Run the demo detecting Chinese:

 ```shell
 # detect on camera input
-python demo.py --model text_recognition_CRNN_CN_2021nov.onnx --charset charset_3944_CN.txt
+python demo.py --model text_recognition_CRNN_CN_2021nov.onnx
 # detect on an image
-python demo.py --input /path/to/image --model text_recognition_CRNN_CN_2021nov.onnx --charset charset_3944_CN.txt
+python demo.py --input /path/to/image --model text_recognition_CRNN_CN_2021nov.onnx

 # get help regarding various parameters
 python demo.py --help
 ```
+### C++
+
+Install latest OpenCV and CMake >= 3.24.0 to get started with:
+
+```shell
+# detect on camera input
+./build/opencv_zoo_text_recognition_crnn 
+# detect on an image
+./build/opencv_zoo_text_recognition_crnn --input /path/to/image -v
+
+# get help regarding various parameters
+./build/opencv_zoo_text_recognition_crnn --help
+```
+
+Run the demo detecting Chinese:
+
+```shell
+# detect on camera input
+./build/opencv_zoo_text_recognition_crnn  --model=text_recognition_CRNN_CN_2021nov.onnx --charset=charset_3944_CN.txt
+# detect on an image
+./build/opencv_zoo_text_recognition_crnn  --input=/path/to/image --model=text_recognition_CRNN_CN_2021nov.onnx --charset=charset_3944_CN.txt
+
+# get help regarding various parameters
+./build/opencv_zoo_text_recognition_crnn  --help
+

 ### Examples


--- a/models/text_recognition_crnn/charset_32_94_3944.h
+++ b/models/text_recognition_crnn/charset_32_94_3944.h
--- a/models/text_recognition_crnn/demo.cpp
+++ b/models/text_recognition_crnn/demo.cpp
+#include <iostream>
+#include <codecvt>
+
+
+#include <opencv2/dnn.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include "charset_32_94_3944.h"
+
+using namespace std;
+using namespace cv;
+using namespace dnn;
+
+vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
+        std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU),
+        std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA),
+        std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CUDA, dnn::DNN_TARGET_CUDA_FP16),
+        std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_TIMVX, dnn::DNN_TARGET_NPU),
+        std::make_pair<cv::dnn::Backend, cv::dnn::Target>(dnn::DNN_BACKEND_CANN, dnn::DNN_TARGET_NPU)};
+        
+vector<u16string> loadCharset(string);
+
+std::string keys =
+"{ help  h           |                                              | Print help message. }"
+"{ model m           | text_recognition_CRNN_EN_2021sep.onnx        | Usage: Set model type, defaults to text_recognition_CRNN_EN_2021sep.onnx }"
+"{ input i           |                                              | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
+"{ width             | 736                                          | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
+"{ height            | 736                                          | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
+"{ binary_threshold  | 0.3                                          | Usage: Threshold of the binary map, default = 0.3.}"
+"{ polygon_threshold | 0.5                                          | Usage: Threshold of polygons, default = 0.5.}"
+"{ max_candidates    | 200                                          | Usage: Set maximum number of polygon candidates, default = 200.}"
+"{ unclip_ratio      | 2.0                                          | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
+"{ save s            | 1                                            | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
+"{ viz v             | 1                                            | Usage: Specify to open a new window to show results.}"
+"{ backend bt        | 0                                            | Choose one of computation backends: "
+"0: (default) OpenCV implementation + CPU, "
+"1: CUDA + GPU (CUDA), "
+"2: CUDA + GPU (CUDA FP16), "
+"3: TIM-VX + NPU, "
+"4: CANN + NPU}";
+
+
+class DB {
+public:
+
+    DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
+        float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
+        dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
+        polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
+        backendId(bId), targetId(tId)
+    {
+        this->model = TextDetectionModel_DB(readNet(modelPath));
+        this->model.setPreferableBackend(backendId);
+        this->model.setPreferableTarget(targetId);
+
+        this->model.setBinaryThreshold(binaryThreshold);
+        this->model.setPolygonThreshold(polygonThreshold);
+        this->model.setUnclipRatio(unclipRatio);
+        this->model.setMaxCandidates(maxCandidates);
+
+        this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
+    }
+    pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
+        CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
+        CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");
+        vector<vector<Point>> pt;
+        vector<float> confidence;
+        this->model.detect(image, pt, confidence);
+        return make_pair< vector<vector<Point>> &, vector< float > &>(pt, confidence);
+    }
+
+private:
+    string modelPath;
+    TextDetectionModel_DB model;
+    Size inputSize;
+    float binaryThreshold;
+    float polygonThreshold;
+    int maxCandidates;
+    double unclipRatio;
+    dnn::Backend backendId;
+    dnn::Target targetId;
+
+};
+
+
+
+class CRNN {
+private:
+    string modelPath;
+    dnn::Backend backendId;
+    dnn::Target targetId;
+    Net model;
+    vector<u16string> charset;
+    Size inputSize;
+    Mat targetVertices;
+
+public:
+    CRNN(string modPath, dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), backendId(bId), targetId(tId) {
+
+        this->model = readNet(this->modelPath);
+        this->model.setPreferableBackend(this->backendId);
+        this->model.setPreferableTarget(this->targetId);
+        // load charset by the name of model
+        if (this->modelPath.find("_EN_") != string::npos)
+            this->charset = loadCharset("CHARSET_EN_36");
+        else if (this->modelPath.find("_CH_") != string::npos)
+            this->charset = loadCharset("CHARSET_CH_94");
+        else if (this->modelPath.find("_CN_") != string::npos)
+            this->charset = loadCharset("CHARSET_CN_3944");
+        else
+            CV_Error(-1, "Charset not supported! Exiting ...");
+
+        this->inputSize = Size(100, 32); // Fixed
+        this->targetVertices = Mat(4, 1, CV_32FC2);
+        this->targetVertices.row(0) = Vec2f(0, this->inputSize.height - 1);
+        this->targetVertices.row(1) = Vec2f(0, 0);
+        this->targetVertices.row(2) = Vec2f(this->inputSize.width - 1, 0);
+        this->targetVertices.row(3) = Vec2f(this->inputSize.width - 1, this->inputSize.height - 1);
+    }
+
+    Mat preprocess(Mat image, Mat rbbox)
+	{
+        // Remove conf, reshape and ensure all is np.float32
+        Mat vertices;
+        rbbox.reshape(2, 4).convertTo(vertices, CV_32FC2);
+
+        Mat rotationMatrix = getPerspectiveTransform(vertices, this->targetVertices);
+        Mat cropped;
+        warpPerspective(image, cropped, rotationMatrix, this->inputSize);
+
+        // 'CN' can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters
+        // 'CH' can detect digits (0\~9), upper/lower-case le6tters (a\~z and A\~Z), some Chinese characters and some special characters
+        if (this->modelPath.find("CN") == string::npos && this->modelPath.find("CH") == string::npos)
+            cvtColor(cropped, cropped, COLOR_BGR2GRAY);
+        Mat blob = blobFromImage(cropped, 1 / 127.5, this->inputSize, Scalar::all(127.5));
+        return blob;
+    }
+
+    u16string infer(Mat image, Mat rbbox)
+	{
+        // Preprocess
+        Mat inputBlob = this->preprocess(image, rbbox);
+
+        //  Forward
+        this->model.setInput(inputBlob);
+        Mat outputBlob = this->model.forward();
+
+        // Postprocess
+        u16string results = this->postprocess(outputBlob);
+
+        return results;
+    }
+
+    u16string postprocess(Mat outputBlob)
+	{
+        // Decode charaters from outputBlob
+        Mat character = outputBlob.reshape(1, outputBlob.size[0]);
+        u16string    text(u"");
+        for (int i = 0; i < character.rows; i++)
+        {
+            double minVal, maxVal;
+            Point maxIdx;
+            minMaxLoc(character.row(i), &minVal, &maxVal, nullptr, &maxIdx);
+            if (maxIdx.x != 0)
+                text += charset[maxIdx.x - 1];
+            else
+                text += u"-";
+        }
+        // adjacent same letters as well as background text must be removed to get the final output
+        u16string    textFilter(u"");
+
+        for (int i = 0; i < text.size(); i++)
+            if (text[i] != u'-' && !(i > 0 && text[i] == text[i - 1]))
+                textFilter += text[i];
+        return textFilter;
+    }
+};
+
+
+Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
+{
+    Mat output;
+    image.copyTo(output);
+    if (fps > 0)
+        putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
+    polylines(output, results.first, isClosed, boxColor, thickness);
+    return output;
+}
+
+int main(int argc, char** argv)
+{
+    CommandLineParser parser(argc, argv, keys);
+
+    parser.about("An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition (https://arxiv.org/abs/1507.05717)");
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int backendTargetid = parser.get<int>("backend");
+    String modelPath = parser.get<String>("model");
+
+    if (modelPath.empty())
+    {
+        CV_Error(Error::StsError, "Model file " + modelPath + " not found");
+    }
+
+    Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
+    float binThresh = parser.get<float>("binary_threshold");
+    float polyThresh = parser.get<float>("polygon_threshold");
+    int maxCand = parser.get<int>("max_candidates");
+    double unRatio = parser.get<float>("unclip_ratio");
+    bool save = parser.get<bool>("save");
+    bool viz = parser.get<float>("viz");
+
+    DB detector("../text_detection_db/text_detection_DB_IC15_resnet18_2021sep.onnx", inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
+    CRNN recognizer(modelPath, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
+    //! [Open a video file or an image file or a camera stream]
+    VideoCapture cap;
+    if (parser.has("input"))
+        cap.open(parser.get<String>("input"));
+    else
+        cap.open(0);
+    if (!cap.isOpened())
+        CV_Error(Error::StsError, "Cannot opend video or file");
+    Mat originalImage;
+    static const std::string kWinName = modelPath;
+    while (waitKey(1) < 0)
+    {
+        cap >> originalImage;
+        if (originalImage.empty())
+        {
+            cout << "Frame is empty" << endl;
+            waitKey();
+            break;
+        }
+        int originalW = originalImage.cols;
+        int originalH = originalImage.rows;
+        double scaleHeight = originalH / double(inpSize.height);
+        double scaleWidth = originalW / double(inpSize.width);
+        Mat image;
+        resize(originalImage, image, inpSize);
+
+        // inference of text detector
+        TickMeter tm;
+        tm.start();
+        pair< vector<vector<Point>>, vector<float> > results = detector.infer(image);
+        tm.stop();
+        if (results.first.size() > 0 && results.second.size() > 0)
+        {
+            u16string texts;
+            auto score=results.second.begin();
+            for (auto box : results.first)
+            {
+                Mat result = Mat(box).reshape(2, 4);
+                texts = texts + u"'" + recognizer.infer(image, result) + u"'";
+            }
+            std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> converter;
+            std::cout << converter.to_bytes(texts) << std::endl;
+        }
+        auto x = results.first;
+        // Scale the results bounding box
+        for (auto &pts : results.first)
+        {
+            for (int i = 0; i < 4; i++)
+            {
+                pts[i].x = int(pts[i].x * scaleWidth);
+                pts[i].y = int(pts[i].y * scaleHeight);
+            }
+        }
+        originalImage = visualize(originalImage, results, tm.getFPS());
+        tm.reset();
+        if (parser.has("input"))
+        {
+            if (save)
+            {
+                cout << "Result image saved to result.jpg\n";
+                imwrite("result.jpg", originalImage);
+            }
+            if (viz)
+                imshow(kWinName, originalImage);
+        }
+        else
+            imshow(kWinName, originalImage);
+  
+    }
+    return 0;
+}