cmake_minimum_required(VERSION 3.24)
set(project_name "opencv_zoo_text_recognition_crnn")
project(${project_name})

# Minimum OpenCV version requested from find_package() below.
set(OPENCV_VERSION "4.7.0")
# Optional hint for a non-default OpenCV install prefix (set with -D or in the GUI).
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")

# Find OpenCV, you may need to set OpenCV_DIR variable
# to the absolute path to the directory containing OpenCVConfig.cmake file
# via the command line or GUI
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})

# Collect the demo sources.
# NOTE(review): the original glob pattern is not visible here; this picks up every
# .cpp file that sits next to this CMakeLists.txt - narrow it if that is too broad.
file(GLOB SourceFile CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
)

# If the package has been found, several variables will
# be set, you can find the full list with descriptions
# in the OpenCVConfig.cmake file.
# Print some message showing some of them
message(STATUS "OpenCV library status:")
message(STATUS " config: ${OpenCV_DIR}")
message(STATUS " version: ${OpenCV_VERSION}")
message(STATUS " libraries: ${OpenCV_LIBS}")
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")

# Declare the executable target built from your sources
add_executable(${project_name} ${SourceFile})

# The demo relies on C++11 features (u16string, range-for, auto).
target_compile_features(${project_name} PRIVATE cxx_std_11)

# Link your application with OpenCV libraries
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
Note:
- Try `text_recognition_CRNN_CH_2021sep.onnx` with `charset_94_CH.txt`
- Try `text_recognition_CRNN_CN_2021sep.onnx` with `charset_3944_CN.txt`.
### Python
Run the demo detecting English:
Run the demo detecting Chinese:
# detect on camera input
python demo.py --model text_recognition_CRNN_CN_2021nov.onnx --charset charset_3944_CN.txt
python demo.py --model text_recognition_CRNN_CN_2021nov.onnx
# detect on an image
python demo.py --input /path/to/image --model text_recognition_CRNN_CN_2021nov.onnx --charset charset_3944_CN.txt
python demo.py --input /path/to/image --model text_recognition_CRNN_CN_2021nov.onnx
# get help regarding various parameters
python demo.py --help
### C++
Install latest OpenCV and CMake >= 3.24.0 to get started with:
# detect on camera input
./build/opencv_zoo_text_recognition_crnn
# detect on an image
./build/opencv_zoo_text_recognition_crnn --input /path/to/image -v
# get help regarding various parameters
./build/opencv_zoo_text_recognition_crnn --help
Run the demo detecting Chinese:
# detect on camera input
./build/opencv_zoo_text_recognition_crnn --model=text_recognition_CRNN_CN_2021nov.onnx --charset=charset_3944_CN.txt
# detect on an image
./build/opencv_zoo_text_recognition_crnn --input=/path/to/image --model=text_recognition_CRNN_CN_2021nov.onnx --charset=charset_3944_CN.txt
# get help regarding various parameters
./build/opencv_zoo_text_recognition_crnn --help
### Examples
#include <iostream>
#include <codecvt>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "charset_32_94_3944.h"
using namespace std;
using namespace cv;
using namespace dnn;
// Selectable (backend, target) combinations. The value of the "backend"
// command-line option is used as an index into this table.
vector< pair<cv::dnn::Backend, cv::dnn::Target> > backendTargetPairs = {
    { dnn::DNN_BACKEND_OPENCV, dnn::DNN_TARGET_CPU },       // 0: OpenCV implementation + CPU
    { dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA },      // 1: CUDA + GPU
    { dnn::DNN_BACKEND_CUDA,   dnn::DNN_TARGET_CUDA_FP16 }, // 2: CUDA + GPU (FP16)
    { dnn::DNN_BACKEND_TIMVX,  dnn::DNN_TARGET_NPU },       // 3: TIM-VX + NPU
    { dnn::DNN_BACKEND_CANN,   dnn::DNN_TARGET_NPU }        // 4: CANN + NPU
};
// Forward declaration. Called by CRNN with a charset name ("CHARSET_EN_36",
// "CHARSET_CH_94" or "CHARSET_CN_3944"); the returned vector is indexed by
// (class index - 1) when decoding the network output.
// The definition is not in this part of the file - presumably it comes with
// charset_32_94_3944.h; TODO confirm.
vector<u16string> loadCharset(string);
// Option table handed to cv::CommandLineParser in main().
// Each entry has the form "{ name alias | default value | help text }".
// The width/height/threshold/candidate/unclip options configure the DB text
// detector; "model" selects the CRNN recognition model; "backend" is an index
// into backendTargetPairs.
std::string keys =
"{ help h | | Print help message. }"
"{ model m | text_recognition_CRNN_EN_2021sep.onnx | Usage: Set model type, defaults to text_recognition_CRNN_EN_2021sep.onnx }"
"{ input i | | Usage: Path to input image or video file. Skip this argument to capture frames from a camera.}"
"{ width | 736 | Usage: Resize input image to certain width, default = 736. It should be multiple by 32.}"
"{ height | 736 | Usage: Resize input image to certain height, default = 736. It should be multiple by 32.}"
"{ binary_threshold | 0.3 | Usage: Threshold of the binary map, default = 0.3.}"
"{ polygon_threshold | 0.5 | Usage: Threshold of polygons, default = 0.5.}"
"{ max_candidates | 200 | Usage: Set maximum number of polygon candidates, default = 200.}"
"{ unclip_ratio | 2.0 | Usage: The unclip ratio of the detected text region, which determines the output size, default = 2.0.}"
"{ save s | 1 | Usage: Specify to save file with results (i.e. bounding box, confidence level). Invalid in case of camera input.}"
"{ viz v | 1 | Usage: Specify to open a new window to show results.}"
"{ backend bt | 0 | Choose one of computation backends: "
"0: (default) OpenCV implementation + CPU, "
"1: CUDA + GPU (CUDA), "
"2: CUDA + GPU (CUDA FP16), "
"3: TIM-VX + NPU, "
"4: CANN + NPU}";
class DB {
DB(string modPath, Size inSize = Size(736, 736), float binThresh = 0.3,
float polyThresh = 0.5, int maxCand = 200, double unRatio = 2.0,
dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), inputSize(inSize), binaryThreshold(binThresh),
polygonThreshold(polyThresh), maxCandidates(maxCand), unclipRatio(unRatio),
backendId(bId), targetId(tId)
this->model = TextDetectionModel_DB(readNet(modelPath));
this->model.setInputParams(1.0 / 255.0, inputSize, Scalar(122.67891434, 116.66876762, 104.00698793));
pair< vector<vector<Point>>, vector<float> > infer(Mat image) {
CV_Assert(image.rows == this->inputSize.height && "height of input image != net input size ");
CV_Assert(image.cols == this->inputSize.width && "width of input image != net input size ");
vector<vector<Point>> pt;
vector<float> confidence;
this->model.detect(image, pt, confidence);
return make_pair< vector<vector<Point>> &, vector< float > &>(pt, confidence);
string modelPath;
TextDetectionModel_DB model;
Size inputSize;
float binaryThreshold;
float polygonThreshold;
int maxCandidates;
double unclipRatio;
dnn::Backend backendId;
dnn::Target targetId;
class CRNN {
string modelPath;
dnn::Backend backendId;
dnn::Target targetId;
Net model;
vector<u16string> charset;
Size inputSize;
Mat targetVertices;
CRNN(string modPath, dnn::Backend bId = DNN_BACKEND_DEFAULT, dnn::Target tId = DNN_TARGET_CPU) : modelPath(modPath), backendId(bId), targetId(tId) {
this->model = readNet(this->modelPath);
// load charset by the name of model
if (this->modelPath.find("_EN_") != string::npos)
this->charset = loadCharset("CHARSET_EN_36");
else if (this->modelPath.find("_CH_") != string::npos)
this->charset = loadCharset("CHARSET_CH_94");
else if (this->modelPath.find("_CN_") != string::npos)
this->charset = loadCharset("CHARSET_CN_3944");
CV_Error(-1, "Charset not supported! Exiting ...");
this->inputSize = Size(100, 32); // Fixed
this->targetVertices = Mat(4, 1, CV_32FC2);
this->targetVertices.row(0) = Vec2f(0, this->inputSize.height - 1);
this->targetVertices.row(1) = Vec2f(0, 0);
this->targetVertices.row(2) = Vec2f(this->inputSize.width - 1, 0);
this->targetVertices.row(3) = Vec2f(this->inputSize.width - 1, this->inputSize.height - 1);
Mat preprocess(Mat image, Mat rbbox)
// Remove conf, reshape and ensure all is np.float32
Mat vertices;
rbbox.reshape(2, 4).convertTo(vertices, CV_32FC2);
Mat rotationMatrix = getPerspectiveTransform(vertices, this->targetVertices);
Mat cropped;
warpPerspective(image, cropped, rotationMatrix, this->inputSize);
// 'CN' can detect digits (0\~9), upper/lower-case letters (a\~z and A\~Z), and some special characters
// 'CH' can detect digits (0\~9), upper/lower-case le6tters (a\~z and A\~Z), some Chinese characters and some special characters
if (this->modelPath.find("CN") == string::npos && this->modelPath.find("CH") == string::npos)
cvtColor(cropped, cropped, COLOR_BGR2GRAY);
Mat blob = blobFromImage(cropped, 1 / 127.5, this->inputSize, Scalar::all(127.5));
return blob;
u16string infer(Mat image, Mat rbbox)
// Preprocess
Mat inputBlob = this->preprocess(image, rbbox);
// Forward
Mat outputBlob = this->model.forward();
// Postprocess
u16string results = this->postprocess(outputBlob);
return results;
u16string postprocess(Mat outputBlob)
// Decode charaters from outputBlob
Mat character = outputBlob.reshape(1, outputBlob.size[0]);
u16string text(u"");
for (int i = 0; i < character.rows; i++)
double minVal, maxVal;
Point maxIdx;
minMaxLoc(character.row(i), &minVal, &maxVal, nullptr, &maxIdx);
if (maxIdx.x != 0)
text += charset[maxIdx.x - 1];
text += u"-";
// adjacent same letters as well as background text must be removed to get the final output
u16string textFilter(u"");
for (int i = 0; i < text.size(); i++)
if (text[i] != u'-' && !(i > 0 && text[i] == text[i - 1]))
textFilter += text[i];
return textFilter;
// Returns a copy of `image` with the detected polygons (results.first) drawn
// on it and, when `fps` is positive, an "FPS: x.xx" label in the top-left
// corner. The input image itself is left untouched.
Mat visualize(Mat image, pair< vector<vector<Point>>, vector<float> >&results, double fps=-1, Scalar boxColor=Scalar(0, 255, 0), Scalar textColor=Scalar(0, 0, 255), bool isClosed=true, int thickness=2)
{
    // Draw on a deep copy; drawing on a default-constructed Mat would have
    // nothing to draw on.
    Mat output;
    image.copyTo(output);

    if (fps > 0)
        putText(output, format("FPS: %.2f", fps), Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);

    polylines(output, results.first, isClosed, boxColor, thickness);
    return output;
}
int main(int argc, char** argv)
CommandLineParser parser(argc, argv, keys);
parser.about("An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition (https://arxiv.org/abs/1507.05717)");
if (parser.has("help"))
return 0;
int backendTargetid = parser.get<int>("backend");
String modelPath = parser.get<String>("model");
if (modelPath.empty())
CV_Error(Error::StsError, "Model file " + modelPath + " not found");
Size inpSize(parser.get<int>("width"), parser.get<int>("height"));
float binThresh = parser.get<float>("binary_threshold");
float polyThresh = parser.get<float>("polygon_threshold");
int maxCand = parser.get<int>("max_candidates");
double unRatio = parser.get<float>("unclip_ratio");
bool save = parser.get<bool>("save");
bool viz = parser.get<float>("viz");
DB detector("../text_detection_db/text_detection_DB_IC15_resnet18_2021sep.onnx", inpSize, binThresh, polyThresh, maxCand, unRatio, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
CRNN recognizer(modelPath, backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
//! [Open a video file or an image file or a camera stream]
VideoCapture cap;
if (parser.has("input"))
if (!cap.isOpened())
CV_Error(Error::StsError, "Cannot opend video or file");
Mat originalImage;
static const std::string kWinName = modelPath;
while (waitKey(1) < 0)
cap >> originalImage;
if (originalImage.empty())
cout << "Frame is empty" << endl;
int originalW = originalImage.cols;
int originalH = originalImage.rows;
double scaleHeight = originalH / double(inpSize.height);
double scaleWidth = originalW / double(inpSize.width);
Mat image;
resize(originalImage, image, inpSize);
// inference of text detector
TickMeter tm;
pair< vector<vector<Point>>, vector<float> > results = detector.infer(image);
if (results.first.size() > 0 && results.second.size() > 0)
u16string texts;
auto score=results.second.begin();
for (auto box : results.first)
Mat result = Mat(box).reshape(2, 4);
texts = texts + u"'" + recognizer.infer(image, result) + u"'";
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> converter;
std::cout << converter.to_bytes(texts) << std::endl;
auto x = results.first;
// Scale the results bounding box
for (auto &pts : results.first)
for (int i = 0; i < 4; i++)
pts[i].x = int(pts[i].x * scaleWidth);
pts[i].y = int(pts[i].y * scaleHeight);
originalImage = visualize(originalImage, results, tm.getFPS());
if (parser.has("input"))
if (save)
cout << "Result image saved to result.jpg\n";
imwrite("result.jpg", originalImage);
if (viz)
imshow(kWinName, originalImage);
imshow(kWinName, originalImage);
return 0;
