提交 fa4d4589 编写于 作者: Z zdenop@gmail.com

fixed hocr (escape special characters; thanks to aizvorski) + hocr config

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@515 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 346da8c1
......@@ -827,13 +827,24 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
AddBoxTohOCR(word->word->bounding_box(), image_height_, &hocr_str);
hocr_str.add_str_int("<span class='xocr_word' id='xword_", page_id);
hocr_str.add_str_int("_", wcnt++);
hocr_str.add_str_int("' title=\"x_wconf ", choice->certainty());
hocr_str.add_str_int("' title=\"x_wconf ", choice->certainty());
hocr_str += "\">";
if (word->bold > 0)
hocr_str += "<strong>";
if (word->italic > 0)
hocr_str += "<em>";
hocr_str += choice->unichar_string();
int i;
// escape special characters
for (i = 0;
choice->unichar_string()[i] != '\0';
i++) {
if (choice->unichar_string()[i] == '<') { hocr_str += "&lt;"; }
else if (choice->unichar_string()[i] == '>') { hocr_str += "&gt;"; }
else if (choice->unichar_string()[i] == '&') { hocr_str += "&amp;"; }
else if (choice->unichar_string()[i] == '"') { hocr_str += "&quot;"; }
else if (choice->unichar_string()[i] == '\'') { hocr_str += "&#39;"; }
else { hocr_str += choice->unichar_string()[i]; }
}
if (word->italic > 0)
hocr_str += "</em>";
if (word->bold > 0)
......
tessedit_create_hocr 1
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册