From ee2054da80510870a2c69f8334e3697c5d6b9edf Mon Sep 17 00:00:00 2001
From: qingqing01
Date: Mon, 20 Apr 2020 12:25:11 +0000
Subject: [PATCH] Add OCR attention model

---
 examples/ocr/README.md                       |  76 +++++
 examples/ocr/data.py                         | 270 +++++++++++++++
 examples/ocr/eval.py                         | 151 +++++++++
 examples/ocr/images/112_chubbiness_13557.jpg | Bin 0 -> 6468 bytes
 examples/ocr/images/177_Interfiled_40185.jpg | Bin 0 -> 8295 bytes
 examples/ocr/images/325_dame_19109.jpg       | Bin 0 -> 8983 bytes
 examples/ocr/images/368_fixtures_29232.jpg   | Bin 0 -> 9689 bytes
 examples/ocr/predict.py                      | 100 ++++++
 examples/ocr/seq2seq_attn.py                 | 333 +++++++++++++++++++
 examples/ocr/train.py                        | 137 ++++++++
 examples/ocr/utility.py                      | 186 +++++++++++
 hapi/callbacks.py                            |   4 +-
 hapi/datasets/folder.py                      |  79 ++++-
 hapi/model.py                                |   6 +-
 14 files changed, 1335 insertions(+), 7 deletions(-)
 create mode 100644 examples/ocr/README.md
 create mode 100644 examples/ocr/data.py
 create mode 100644 examples/ocr/eval.py
 create mode 100644 examples/ocr/images/112_chubbiness_13557.jpg
 create mode 100644 examples/ocr/images/177_Interfiled_40185.jpg
 create mode 100644 examples/ocr/images/325_dame_19109.jpg
 create mode 100644 examples/ocr/images/368_fixtures_29232.jpg
 create mode 100644 examples/ocr/predict.py
 create mode 100644 examples/ocr/seq2seq_attn.py
 create mode 100644 examples/ocr/train.py
 create mode 100644 examples/ocr/utility.py

diff --git a/examples/ocr/README.md b/examples/ocr/README.md
new file mode 100644
index 0000000..d3d592d
--- /dev/null
+++ b/examples/ocr/README.md
@@ -0,0 +1,76 @@
+Introduction
+--------
+This OCR example recognizes a single line of characters in an image with an attention-based seq2seq model. Running the programs in this directory requires the latest PaddlePaddle develop version.
+
+## Code structure
+```
+.
+|-- data.py # data reading
+|-- eval.py # evaluation script
+|-- images # test images
+|-- predict.py # prediction script
+|-- seq2seq_attn.py # model definition
+|-- train.py # training script
+`-- utility.py # common utilities
+```
+
+## Training / evaluation / prediction workflow
+
+- Set the GPU environment:
+
+```
+export CUDA_VISIBLE_DEVICES=0
+```
+
+- Training
+
+```
+python train.py
+```
+
+More options can be listed with `--help`.
+
+- Switching between static and dynamic graph modes
+
+```
+python train.py --dynamic=True
+```
+
+- Evaluation
+
+```
+python eval.py --init_model=checkpoint/final
+```
+
+- Prediction
+
+Prediction in dygraph (dynamic graph) mode is not supported yet.
+
+```
+python predict.py --init_model=checkpoint/final --image_path=images/ --dynamic=False --beam_size=3
+```
+
+The prediction output looks like this:
+
+```
+Image 1: images/112_chubbiness_13557.jpg
+0: chubbines
+1: chubbiness
+2: chubbinesS
+Image 2: images/177_Interfiled_40185.jpg
+0: Interflied
+1: Interfiled
+2: InterfIled
+Image 3: images/325_dame_19109.jpg
+0: da
+1: damo
+2: dame
+Image 4: images/368_fixtures_29232.jpg
+0: firtures
+1: Firtures
+2: fixtures
+```
diff --git a/examples/ocr/data.py b/examples/ocr/data.py
new file mode 100644
index 0000000..00c4b1a
--- /dev/null
+++ b/examples/ocr/data.py
@@ -0,0 +1,270 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from os import path
+import random
+import traceback
+import copy
+import math
+import tarfile
+from PIL import Image
+
+import logging
+logger = logging.getLogger(__name__)
+
+import paddle
+from paddle import fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
+DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
+CACHE_DIR_NAME = "attention_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class BatchCompose(object):
+    def __init__(self, transforms=[]):
self.transforms = transforms + + def __call__(self, data): + for f in self.transforms: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.info("fail to perform batch transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + # sample list to batch data + batch = list(zip(*data)) + return batch + + +class Compose(object): + def __init__(self, transforms=[]): + self.transforms = transforms + + def __call__(self, *data): + for f in self.transforms: + try: + data = f(*data) + except Exception as e: + stack_info = traceback.format_exc() + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + return data + + +class Resize(object): + def __init__(self, height=48): + self.interp = Image.NEAREST # Image.ANTIALIAS + self.height = height + + def __call__(self, samples): + shape = samples[0][0].size + for i in range(len(samples)): + im = samples[i][0] + im = im.resize((shape[0], self.height), self.interp) + samples[i][0] = im + return samples + + +class Normalize(object): + def __init__(self, + mean=[127.5], + std=[1.0], + scale=False, + channel_first=True): + self.mean = mean + self.std = std + self.scale = scale + self.channel_first = channel_first + if not (isinstance(self.mean, list) and isinstance(self.std, list) and + isinstance(self.scale, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + + def __call__(self, samples): + for i in range(len(samples)): + im = samples[i][0] + im = np.array(im).astype(np.float32, copy=False) + im = im[np.newaxis, ...] + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + if self.scale: + im = im / 255.0 + #im -= mean + im -= 127.5 + #im /= std + samples[i][0] = im + return samples + + +class PadTarget(object): + def __init__(self, SOS=0, EOS=1): + self.SOS = SOS + self.EOS = EOS + + def __call__(self, samples): + lens = np.array([len(s[1]) for s in samples], dtype="int64") + max_len = np.max(lens) + for i in range(len(samples)): + label = samples[i][1] + if max_len > len(label): + pad_label = label + [self.EOS] * (max_len - len(label)) + else: + pad_label = label + samples[i][1] = np.array([self.SOS] + pad_label, dtype='int64') + # label_out + samples[i].append(np.array(pad_label + [self.EOS], dtype='int64')) + mask = np.zeros((max_len + 1)).astype('float32') + mask[:len(label) + 1] = 1.0 + # mask + samples[i].append(np.array(mask, dtype='float32')) + return samples + + +class MyBatchSampler(fluid.io.BatchSampler): + def __init__(self, + dataset, + batch_size, + shuffle=False, + drop_last=True, + seed=None): + self._dataset = dataset + self._batch_size = batch_size + self._shuffle = shuffle + self._drop_last = drop_last + self._random = np.random + self._random.seed(seed) + self._nranks = ParallelEnv().nranks + self._local_rank = ParallelEnv().local_rank + self._device_id = ParallelEnv().dev_id + self._num_samples = int( + math.ceil(len(self._dataset) * 1.0 / self._nranks)) + self._total_size = self._num_samples * self._nranks + self._epoch = 0 + + def __iter__(self): + infos = copy.copy(self._dataset._sample_infos) + skip_num = 0 + if self._shuffle: + if self._batch_size == 1: + self._random.RandomState(self._epoch).shuffle(infos) + else: # partial shuffle + infos = sorted(infos, key=lambda x: x.w) + skip_num = random.randint(1, 100) + + infos = infos[skip_num:] + infos[:skip_num] + infos += infos[:(self._total_size - len(infos))] + 
last_size = self._total_size % (self._batch_size * self._nranks) + batches = [] + for i in range(self._local_rank * self._batch_size, + len(infos) - last_size, + self._batch_size * self._nranks): + batches.append(infos[i:i + self._batch_size]) + + if (not self._drop_last) and last_size != 0: + last_local_size = last_size // self._nranks + last_infos = infos[len(infos) - last_size:] + start = self._local_rank * last_local_size + batches.append(last_infos[start:start + last_local_size]) + + if self._shuffle: + self._random.RandomState(self._epoch).shuffle(batches) + self._epoch += 1 + + for batch in batches: + batch_indices = [info.idx for info in batch] + yield batch_indices + + def __len__(self): + if self._drop_last: + return self._total_size // self._batch_size + else: + return math.ceil(self._total_size / float(self._batch_size)) + + +class SampleInfo(object): + def __init__(self, idx, h, w, im_name, labels): + self.idx = idx + self.h = h + self.w = w + self.im_name = im_name + self.labels = labels + + +class OCRDataset(paddle.io.Dataset): + def __init__(self, image_dir, anno_file): + self.image_dir = image_dir + self.anno_file = anno_file + self._sample_infos = [] + with open(anno_file, 'r') as f: + for i, line in enumerate(f): + w, h, im_name, labels = line.strip().split(' ') + h, w = int(h), int(w) + labels = [int(c) for c in labels.split(',')] + self._sample_infos.append(SampleInfo(i, h, w, im_name, labels)) + #self._sample_infos = sorted(self._sample_infos, + # key=lambda x: x.w) + + def __getitem__(self, idx): + info = self._sample_infos[idx] + im_name, labels = info.im_name, info.labels + image = Image.open(path.join(self.image_dir, im_name)).convert('L') + return [image, labels] + + def __len__(self): + return len(self._sample_infos) + + +def train( + root_dir=None, + images_dir=None, + anno_file=None, + shuffle=True, ): + if root_dir is None: + root_dir = download_data() + if images_dir is None: + images_dir = TRAIN_DATA_DIR_NAME + images_dir = path.join(root_dir, TRAIN_DATA_DIR_NAME) + if anno_file is None: + anno_file = TRAIN_LIST_FILE_NAME + anno_file = path.join(root_dir, TRAIN_LIST_FILE_NAME) + return OCRDataset(images_dir, anno_file) + + +def test( + root_dir=None, + images_dir=None, + anno_file=None, + shuffle=True, ): + if root_dir is None: + root_dir = download_data() + if images_dir is None: + images_dir = TEST_DATA_DIR_NAME + images_dir = path.join(root_dir, TEST_DATA_DIR_NAME) + if anno_file is None: + anno_file = TEST_LIST_FILE_NAME + anno_file = path.join(root_dir, TEST_LIST_FILE_NAME) + return OCRDataset(images_dir, anno_file) + + +def download_data(): + '''Download train and test data. + ''' + tar_file = paddle.dataset.common.download( + DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME) + data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME) + if not path.isdir(data_dir): + t = tarfile.open(tar_file, "r:gz") + t.extractall(path=path.dirname(tar_file)) + t.close() + return data_dir diff --git a/examples/ocr/eval.py b/examples/ocr/eval.py new file mode 100644 index 0000000..e3c487d --- /dev/null +++ b/examples/ocr/eval.py @@ -0,0 +1,151 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import argparse
+import functools
+
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+
+from hapi.model import Input, set_device
+
+from utility import add_arguments, print_arguments
+from utility import SeqAccuracy, MyProgBarLogger, SeqBeamAccuracy
+from utility import postprocess
+from seq2seq_attn import Seq2SeqAttModel, Seq2SeqAttInferModel, WeightCrossEntropy
+import data
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('test_images', str, None, "The directory of images to be used for test.")
+add_arg('test_list', str, None, "The list file of images to be used for test.")
+add_arg('init_model', str, 'checkpoint/final', "The init model file or directory.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+add_arg('encoder_size', int, 200, "Encoder size.")
+add_arg('decoder_size', int, 128, "Decoder size.")
+add_arg('embedding_dim', int, 128, "Word vector dim.")
+add_arg('num_classes', int, 95, "Number of classes.")
+add_arg('beam_size', int, 0, "If nonzero, use beam search with this beam size.")
+add_arg('dynamic', bool, False, "Whether to use dygraph.")
+# yapf: enable
+
+
+def main(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes)
+
+    # yapf: disable
+    inputs = [
+        Input([None, 1, 48, 384], "float32", name="pixel"),
+        Input([None, None], "int64", name="label_in")
+    ]
+    labels = [
+        Input([None, None], "int64", name="label_out"),
+        Input([None, None], "float32", name="mask")
+    ]
+    # yapf: enable
+
+    model.prepare(
+        loss_function=WeightCrossEntropy(),
+        metrics=SeqAccuracy(),
+        inputs=inputs,
+        labels=labels,
+        device=device)
+    model.load(FLAGS.init_model)
+
+    test_dataset = data.test()
+    test_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    model.evaluate(
+        eval_data=test_loader,
+        callbacks=[MyProgBarLogger(10, 2, FLAGS.batch_size)])
+
+
+def beam_search(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttInferModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes,
+        beam_size=FLAGS.beam_size)
+
+    inputs = [
+        Input([None, 1, 48, 384], "float32", name="pixel"),
+        Input([None, None], "int64", name="label_in")
+    ]
+    labels = [
+        Input([None, None], "int64", name="label_out"),
+        Input([None, None], "float32", name="mask")
+    ]
+    model.prepare(
+        loss_function=None,
+        metrics=SeqBeamAccuracy(),
+        inputs=inputs,
+        labels=labels,
+        device=device)
+    model.load(FLAGS.init_model)
+
+    test_dataset = data.test()
+    test_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    model.evaluate(
+        eval_data=test_loader,
+        callbacks=[MyProgBarLogger(10, 2, FLAGS.batch_size)])
+
+
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    if FLAGS.beam_size:
+        beam_search(FLAGS)
+    else:
+        main(FLAGS)
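Both the eval and train pipelines feed the model `label_in`/`label_out`/`mask` triples produced by `data.PadTarget` above: each label sequence is padded to the batch maximum with EOS, prefixed with SOS to form the decoder input, suffixed with EOS to form the target, and paired with a 0/1 mask over the real steps. A minimal standalone numpy sketch of that layout, with toy label ids (illustrative only, not part of the patch):

```
import numpy as np

SOS, EOS = 0, 1
labels = [[5, 7, 9], [4, 6]]           # toy label id sequences
max_len = max(len(l) for l in labels)  # 3

for label in labels:
    pad = label + [EOS] * (max_len - len(label))
    label_in = np.array([SOS] + pad, dtype='int64')   # decoder input
    label_out = np.array(pad + [EOS], dtype='int64')  # decoder target
    mask = np.zeros(max_len + 1, dtype='float32')
    mask[:len(label) + 1] = 1.0                       # real steps, incl. EOS
    print(label_in, label_out, mask)

# [0 5 7 9] [5 7 9 1] [1. 1. 1. 1.]
# [0 4 6 1] [4 6 1 1] [1. 1. 1. 0.]
```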
diff --git a/examples/ocr/images/112_chubbiness_13557.jpg b/examples/ocr/images/112_chubbiness_13557.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4474a0db2b40a618ecb5401022958651e9aa0543
GIT binary patch
literal 6468
(base85 binary image data omitted)
diff --git a/examples/ocr/images/177_Interfiled_40185.jpg b/examples/ocr/images/177_Interfiled_40185.jpg
new file mode 100644
GIT binary patch
literal 8295
(base85 binary image data omitted)
diff --git a/examples/ocr/images/325_dame_19109.jpg b/examples/ocr/images/325_dame_19109.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..12554431319a03fedd33a51da806414b56e2119e
GIT binary patch
literal 8983
(base85 binary image data omitted)
diff --git a/examples/ocr/images/368_fixtures_29232.jpg b/examples/ocr/images/368_fixtures_29232.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7566131c8f1e222be21a4f9dd6f9321705dea617
GIT binary patch
literal 9689
(base85 binary image data omitted)
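predict.py below turns raw beam outputs into strings with `postprocess` and `index2word` from utility.py (further down in this patch): ids are truncated at the first EOS, SOS/EOS ids are dropped, and each remaining id `k` maps to the printable character `chr(k + 33)`. A standalone sketch with a made-up id sequence (the helper bodies mirror utility.py):

```
import numpy as np

def postprocess(seq, bos_idx=0, eos_idx=1):
    # as in utility.py: truncate at the first EOS, drop BOS/EOS ids
    if type(seq) is np.ndarray:
        seq = seq.tolist()
    eos_pos = len(seq) - 1
    for i, idx in enumerate(seq):
        if idx == eos_idx:
            eos_pos = i
            break
    return [idx for idx in seq[:eos_pos + 1]
            if idx != bos_idx and idx != eos_idx]

def index2word(ids):
    # ids 0/1 are SOS/EOS; id k maps to the ASCII character chr(k + 33)
    return [chr(int(k + 33)) for k in ids]

beam = np.array([0, 67, 64, 76, 68, 1, 1])  # hypothetical beam output
print("".join(index2word(postprocess(beam))))  # -> dame
```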
diff --git a/examples/ocr/predict.py b/examples/ocr/predict.py
new file mode 100644
index 0000000..d1a66f7
--- /dev/null
+++ b/examples/ocr/predict.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import sys
+import random
+import numpy as np
+
+import argparse
+import functools
+from PIL import Image
+
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+
+from hapi.model import Input, set_device
+from hapi.datasets.folder import ImageFolder
+
+from utility import add_arguments, print_arguments
+from utility import postprocess, index2word
+from seq2seq_attn import Seq2SeqAttInferModel, WeightCrossEntropy
+import data
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 1, "Minibatch size.")
+add_arg('image_path', str, None, "The directory of images to be used for test.")
+add_arg('init_model', str, None, "The init model file or directory.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
+# model hyper parameters
+add_arg('encoder_size', int, 200, "Encoder size.")
+add_arg('decoder_size', int, 128, "Decoder size.")
+add_arg('embedding_dim', int, 128, "Word vector dim.")
+add_arg('num_classes', int, 95, "Number of classes.")
+add_arg('beam_size', int, 3, "Beam size for beam search.")
+add_arg('dynamic', bool, False, "Whether to use dygraph.")
+# yapf: enable
+
+
+def main(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    fluid.enable_dygraph(device) if FLAGS.dynamic else None
+    model = Seq2SeqAttInferModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes,
+        beam_size=FLAGS.beam_size)
+
+    inputs = [Input([None, 1, 48, 384], "float32", name="pixel"), ]
+
+    model.prepare(inputs=inputs, device=device)
+    model.load(FLAGS.init_model)
+
+    fn = lambda p: Image.open(p).convert('L')
+    test_dataset = ImageFolder(FLAGS.image_path, loader=fn)
+    test_collate_fn = data.BatchCompose([data.Resize(), data.Normalize()])
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    samples = test_dataset.samples
+    #outputs = model.predict(test_loader)
+    ins_id = 0
+    for image, in test_loader:
+        image = image if FLAGS.dynamic else image[0]
+        pred = model.test_batch([image])[0]
+        pred = pred[:, :, np.newaxis] if len(pred.shape) == 2 else pred
+        pred = np.transpose(pred, [0, 2, 1])
+        for ins in pred:
+            impath = samples[ins_id]
+            ins_id += 1
+            print('Image {}: {}'.format(ins_id, impath))
+            for beam_idx, beam in enumerate(ins):
+                id_list = postprocess(beam)
+                word_list = index2word(id_list)
+                sequence = "".join(word_list)
+                print('{}: {}'.format(beam_idx, sequence))
+
+
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    main(FLAGS)
diff --git a/examples/ocr/seq2seq_attn.py b/examples/ocr/seq2seq_attn.py
new file mode 100644
index 0000000..5e5cd42
--- /dev/null
+++ b/examples/ocr/seq2seq_attn.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.layers import BeamSearchDecoder
+
+from hapi.text import RNNCell, RNN, DynamicDecode
+from hapi.model import Model, Loss
+
+
+class ConvBNPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 in_ch,
+                 out_ch,
+                 act="relu",
+                 is_test=False,
+                 pool=True,
+                 use_cudnn=True):
+        super(ConvBNPool, self).__init__()
+        self.pool = pool
+
+        filter_size = 3
+        std = (2.0 / (filter_size**2 * in_ch))**0.5
+        param_0 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, std))
+
+        std = (2.0 / (filter_size**2 * out_ch))**0.5
+        param_1 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, std))
+
+        self.conv0 = fluid.dygraph.Conv2D(
+            in_ch,
+            out_ch,
+            3,
+            padding=1,
+            param_attr=param_0,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn0 = fluid.dygraph.BatchNorm(out_ch, act=act)
+        self.conv1 = fluid.dygraph.Conv2D(
+            out_ch,
+            out_ch,
+            filter_size=3,
+            padding=1,
+            param_attr=param_1,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn1 = fluid.dygraph.BatchNorm(out_ch, act=act)
+
+        if self.pool:
+            self.pool = fluid.dygraph.Pool2D(
+                pool_size=2,
+                pool_type='max',
+                pool_stride=2,
+                use_cudnn=use_cudnn,
+                ceil_mode=True)
+
+    def forward(self, inputs):
+        out = self.conv0(inputs)
+        out = self.bn0(out)
+        out = self.conv1(out)
+        out = self.bn1(out)
+        if self.pool:
+            out = self.pool(out)
+        return out
+
+
+class CNN(fluid.dygraph.Layer):
+    def __init__(self, in_ch=1, is_test=False):
+        super(CNN, self).__init__()
+        self.conv_bn1 = ConvBNPool(in_ch, 16)
+        self.conv_bn2 = ConvBNPool(16, 32)
+        self.conv_bn3 = ConvBNPool(32, 64)
+        self.conv_bn4 = ConvBNPool(64, 128, pool=False)
+
+    def forward(self, inputs):
+        conv = self.conv_bn1(inputs)
+        conv = self.conv_bn2(conv)
+        conv = self.conv_bn3(conv)
+        conv = self.conv_bn4(conv)
+        return conv
+
+
+class GRUCell(RNNCell):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 param_attr=None,
+                 bias_attr=None,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 origin_mode=False):
+        super(GRUCell, self).__init__()
+        self.hidden_size = hidden_size
+        self.fc_layer = fluid.dygraph.Linear(
+            input_size,
+            hidden_size * 3,
+            param_attr=param_attr,
+            bias_attr=False)
+
+        self.gru_unit = fluid.dygraph.GRUUnit(
+            hidden_size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
origin_mode=origin_mode) + + def forward(self, inputs, states): + # step_outputs, new_states = cell(step_inputs, states) + # for GRUCell, `step_outputs` and `new_states` both are hidden + x = self.fc_layer(inputs) + hidden, _, _ = self.gru_unit(x, states) + return hidden, hidden + + @property + def state_shape(self): + return [self.hidden_size] + + +class Encoder(fluid.dygraph.Layer): + def __init__( + self, + in_channel=1, + rnn_hidden_size=200, + decoder_size=128, + is_test=False, ): + super(Encoder, self).__init__() + self.rnn_hidden_size = rnn_hidden_size + + self.backbone = CNN(in_ch=in_channel, is_test=is_test) + + para_attr = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, 0.02)) + bias_attr = fluid.ParamAttr( + initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) + self.gru_fwd = RNN(cell=GRUCell( + input_size=128 * 6, + hidden_size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu'), + is_reverse=False, + time_major=False) + self.gru_bwd = RNN(cell=GRUCell( + input_size=128 * 6, + hidden_size=rnn_hidden_size, + param_attr=para_attr, + bias_attr=bias_attr, + candidate_activation='relu'), + is_reverse=True, + time_major=False) + self.encoded_proj_fc = fluid.dygraph.Linear( + rnn_hidden_size * 2, decoder_size, bias_attr=False) + + def forward(self, inputs): + conv_features = self.backbone(inputs) + conv_features = fluid.layers.transpose( + conv_features, perm=[0, 3, 1, 2]) + + n, w, c, h = conv_features.shape + seq_feature = fluid.layers.reshape(conv_features, [0, -1, c * h]) + + gru_fwd, _ = self.gru_fwd(seq_feature) + gru_bwd, _ = self.gru_bwd(seq_feature) + + encoded_vector = fluid.layers.concat(input=[gru_fwd, gru_bwd], axis=2) + encoded_proj = self.encoded_proj_fc(encoded_vector) + return gru_bwd, encoded_vector, encoded_proj + + +class Attention(fluid.dygraph.Layer): + """ + Neural Machine Translation by Jointly Learning to Align and Translate. 
+    https://arxiv.org/abs/1409.0473
+    """
+
+    def __init__(self, decoder_size):
+        super(Attention, self).__init__()
+        self.fc1 = fluid.dygraph.Linear(
+            decoder_size, decoder_size, bias_attr=False)
+        self.fc2 = fluid.dygraph.Linear(decoder_size, 1, bias_attr=False)
+
+    def forward(self, encoder_vec, encoder_proj, decoder_state):
+        # alignment model, single-layer multilayer perceptron
+        decoder_state = self.fc1(decoder_state)
+        decoder_state = fluid.layers.unsqueeze(decoder_state, [1])
+
+        e = fluid.layers.elementwise_add(encoder_proj, decoder_state)
+        e = fluid.layers.tanh(e)
+
+        att_scores = self.fc2(e)
+        att_scores = fluid.layers.squeeze(att_scores, [2])
+        att_scores = fluid.layers.softmax(att_scores)
+
+        context = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=att_scores, axis=0)
+        context = fluid.layers.reduce_sum(context, dim=1)
+        return context
+
+
+class DecoderCell(RNNCell):
+    def __init__(self, encoder_size=200, decoder_size=128):
+        super(DecoderCell, self).__init__()
+        self.attention = Attention(decoder_size)
+        self.gru_cell = GRUCell(
+            input_size=encoder_size * 2 + decoder_size,
+            hidden_size=decoder_size)
+
+    def forward(self, current_word, states, encoder_vec, encoder_proj):
+        context = self.attention(encoder_vec, encoder_proj, states)
+        decoder_inputs = fluid.layers.concat([current_word, context], axis=1)
+        hidden, _ = self.gru_cell(decoder_inputs, states)
+        return hidden, hidden
+
+
+class Decoder(fluid.dygraph.Layer):
+    def __init__(self, num_classes, emb_dim, encoder_size, decoder_size):
+        super(Decoder, self).__init__()
+        self.decoder_attention = RNN(DecoderCell(encoder_size, decoder_size))
+        self.fc = fluid.dygraph.Linear(
+            decoder_size, num_classes + 2, act='softmax')
+
+    def forward(self, target, initial_states, encoder_vec, encoder_proj):
+        out, _ = self.decoder_attention(
+            target,
+            initial_states=initial_states,
+            encoder_vec=encoder_vec,
+            encoder_proj=encoder_proj)
+        pred = self.fc(out)
+        return pred
+
+
+class Seq2SeqAttModel(Model):
+    def __init__(
+            self,
+            in_channel=1,
+            encoder_size=200,
+            decoder_size=128,
+            emb_dim=128,
+            num_classes=None, ):
+        super(Seq2SeqAttModel, self).__init__()
+        self.encoder = Encoder(in_channel, encoder_size, decoder_size)
+        self.fc = fluid.dygraph.Linear(
+            input_dim=encoder_size,
+            output_dim=decoder_size,
+            bias_attr=False,
+            act='relu')
+        self.embedding = fluid.dygraph.Embedding(
+            [num_classes + 2, emb_dim], dtype='float32')
+        self.decoder = Decoder(num_classes, emb_dim, encoder_size,
+                               decoder_size)
+
+    def forward(self, inputs, target):
+        gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
+        decoder_boot = self.fc(gru_backward[:, 0])
+        trg_embedding = self.embedding(target)
+        prediction = self.decoder(trg_embedding, decoder_boot, encoded_vector,
+                                  encoded_proj)
+        return prediction
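`Seq2SeqAttModel.forward` runs the CNN + bi-GRU encoder once, boots the decoder state from the first backward-GRU step, and lets `DecoderCell` query `Attention` at every output step. For reference, the alignment math in `Attention.forward` reduces to a few dense operations; a toy-shaped numpy sketch with random values standing in for the `fc1`/`fc2` weights (shapes assumed from the model defaults, `encoder_size=200`):

```
import numpy as np

N, T, D = 2, 6, 128                            # batch, source steps, decoder size
encoder_vec = np.random.randn(N, T, 2 * 200)   # bi-GRU features (2 * encoder_size)
encoder_proj = np.random.randn(N, T, D)        # output of encoded_proj_fc
decoder_state = np.random.randn(N, D)
W1 = np.random.randn(D, D)                     # stand-in for fc1 weight
v = np.random.randn(D)                         # stand-in for fc2 weight

e = np.tanh(encoder_proj + (decoder_state @ W1)[:, None, :])     # (N, T, D)
scores = e @ v                                                   # (N, T)
scores = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # softmax over T
context = (encoder_vec * scores[:, :, None]).sum(axis=1)         # (N, 2 * 200)
```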
encoded_vector, encoded_proj = self.encoder(inputs) + decoder_boot = self.fc(gru_backward[:, 0]) + + if self.beam_size: + # Tile the batch dimension with beam_size + encoded_vector = BeamSearchDecoder.tile_beam_merge_with_batch( + encoded_vector, self.beam_size) + encoded_proj = BeamSearchDecoder.tile_beam_merge_with_batch( + encoded_proj, self.beam_size) + # dynamic decoding with beam search + rs, _ = self.infer_decoder( + inits=decoder_boot, + encoder_vec=encoded_vector, + encoder_proj=encoded_proj) + return rs + + +class WeightCrossEntropy(Loss): + def __init__(self): + super(WeightCrossEntropy, self).__init__(average=False) + + def forward(self, outputs, labels): + predict, (label, mask) = outputs[0], labels + loss = layers.cross_entropy(predict, label=label) + loss = layers.elementwise_mul(loss, mask, axis=0) + loss = layers.reduce_sum(loss) + return loss diff --git a/examples/ocr/train.py b/examples/ocr/train.py new file mode 100644 index 0000000..789edca --- /dev/null +++ b/examples/ocr/train.py @@ -0,0 +1,137 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import sys +import random +import numpy as np + +import argparse +import functools + +import paddle.fluid.profiler as profiler +import paddle.fluid as fluid + +from hapi.model import Input, set_device + +from utility import add_arguments, print_arguments +from utility import SeqAccuracy, MyProgBarLogger +from seq2seq_attn import Seq2SeqAttModel, WeightCrossEntropy +import data + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('epoch', int, 30, "Epoch number.") +add_arg('num_workers', int, 0, "workers number.") +add_arg('lr', float, 0.001, "Learning rate.") +add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.") +add_arg('checkpoint_path', str, "checkpoint", "The directory the model to be saved to.") +add_arg('train_images', str, None, "The directory of images to be used for training.") +add_arg('train_list', str, None, "The list file of images to be used for training.") +add_arg('test_images', str, None, "The directory of images to be used for test.") +add_arg('test_list', str, None, "The list file of images to be used for training.") +add_arg('resume_path', str, None, "The init model file of directory.") +add_arg('use_gpu', bool, True, "Whether use GPU to train.") +# model hyper paramters +add_arg('encoder_size', int, 200, "Encoder size.") +add_arg('decoder_size', int, 128, "Decoder size.") +add_arg('embedding_dim', int, 128, "Word vector dim.") +add_arg('num_classes', int, 95, "Number classes.") +add_arg('gradient_clip', float, 5.0, "Gradient clip value.") +add_arg('dynamic', bool, False, "Whether to use dygraph.") +# yapf: enable + + +def main(FLAGS): + device = set_device("gpu" if FLAGS.use_gpu else "cpu") + fluid.enable_dygraph(device) if 
+
+
+def main(FLAGS):
+    device = set_device("gpu" if FLAGS.use_gpu else "cpu")
+    if FLAGS.dynamic:
+        fluid.enable_dygraph(device)
+
+    model = Seq2SeqAttModel(
+        encoder_size=FLAGS.encoder_size,
+        decoder_size=FLAGS.decoder_size,
+        emb_dim=FLAGS.embedding_dim,
+        num_classes=FLAGS.num_classes)
+
+    lr = FLAGS.lr
+    if FLAGS.lr_decay_strategy == "piecewise_decay":
+        learning_rate = fluid.layers.piecewise_decay(
+            [200000, 250000], [lr, lr * 0.1, lr * 0.01])
+    else:
+        learning_rate = lr
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(FLAGS.gradient_clip)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=learning_rate,
+        parameter_list=model.parameters(),
+        grad_clip=grad_clip)
+
+    # yapf: disable
+    inputs = [
+        Input([None, 1, 48, 384], "float32", name="pixel"),
+        Input([None, None], "int64", name="label_in"),
+    ]
+    labels = [
+        Input([None, None], "int64", name="label_out"),
+        Input([None, None], "float32", name="mask"),
+    ]
+    # yapf: enable
+
+    model.prepare(
+        optimizer,
+        WeightCrossEntropy(),
+        SeqAccuracy(),
+        inputs=inputs,
+        labels=labels)
+
+    train_dataset = data.train()
+    train_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    train_sampler = data.MyBatchSampler(
+        train_dataset, batch_size=FLAGS.batch_size, shuffle=True)
+    train_loader = fluid.io.DataLoader(
+        train_dataset,
+        batch_sampler=train_sampler,
+        places=device,
+        num_workers=FLAGS.num_workers,
+        return_list=True,
+        collate_fn=train_collate_fn)
+    test_dataset = data.test()
+    test_collate_fn = data.BatchCompose(
+        [data.Resize(), data.Normalize(), data.PadTarget()])
+    test_sampler = data.MyBatchSampler(
+        test_dataset,
+        batch_size=FLAGS.batch_size,
+        drop_last=False,
+        shuffle=False)
+    test_loader = fluid.io.DataLoader(
+        test_dataset,
+        batch_sampler=test_sampler,
+        places=device,
+        num_workers=0,
+        return_list=True,
+        collate_fn=test_collate_fn)
+
+    model.fit(train_data=train_loader,
+              eval_data=test_loader,
+              epochs=FLAGS.epoch,
+              save_dir=FLAGS.checkpoint_path,
+              callbacks=[MyProgBarLogger(10, 2, FLAGS.batch_size)])
+
+
+if __name__ == '__main__':
+    FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
+    main(FLAGS)
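For reference, the learning-rate schedule configured above can be sketched in pure Python (a minimal illustration; `piecewise_lr` is a hypothetical helper, not part of this patch):

```
def piecewise_lr(step, boundaries=(200000, 250000),
                 values=(0.001, 0.0001, 0.00001)):
    """Return the learning rate in effect at a given global step."""
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert piecewise_lr(100000) == 0.001    # before the first boundary
assert piecewise_lr(220000) == 0.0001   # after the first decay
assert piecewise_lr(300000) == 0.00001  # after the second decay
```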
diff --git a/examples/ocr/utility.py b/examples/ocr/utility.py
new file mode 100644
index 0000000..c64547b
--- /dev/null
+++ b/examples/ocr/utility.py
@@ -0,0 +1,186 @@
+"""Contains common utility functions."""
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+import paddle.fluid as fluid
+import six
+
+from hapi.metrics import Metric
+from hapi.callbacks import ProgBarLogger
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(six.iteritems(vars(args))):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add an argparse argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+class SeqAccuracy(Metric):
+    def __init__(self, name=None, *args, **kwargs):
+        super(SeqAccuracy, self).__init__(*args, **kwargs)
+        self._name = 'seq_acc'
+        self.reset()
+
+    def add_metric_op(self, output, label, mask, *args, **kwargs):
+        pred = fluid.layers.flatten(output, axis=2)
+        score, topk = fluid.layers.topk(pred, 1)
+        return topk, label, mask
+
+    def update(self, topk, label, mask, *args, **kwargs):
+        # A sequence counts as correct only if every token matches the
+        # reference.
+        topk = topk.reshape(label.shape[0], -1)
+        seq_len = np.sum(mask, -1)
+        acc = 0
+        for i in range(label.shape[0]):
+            l = int(seq_len[i] - 1)
+            pred = topk[i][:l - 1]
+            ref = label[i][:l - 1]
+            if np.array_equal(pred, ref):
+                self.total += 1
+                acc += 1
+            self.count += 1
+        return float(acc) / label.shape[0]
+
+    def reset(self):
+        self.total = 0.
+        self.count = 0.
+
+    def accumulate(self):
+        return float(self.total) / self.count
+
+    def name(self):
+        return self._name
+
+
+class MyProgBarLogger(ProgBarLogger):
+    # Normalize the summed WeightCrossEntropy loss to a per-sample loss
+    # before handing it to the progress bar.
+    def __init__(self, log_freq=1, verbose=2, train_bs=None, eval_bs=None):
+        super(MyProgBarLogger, self).__init__(log_freq, verbose)
+        self.train_bs = train_bs
+        self.eval_bs = eval_bs if eval_bs else train_bs
+
+    def on_train_batch_end(self, step, logs=None):
+        logs = logs or {}
+        logs['loss'] = [l / self.train_bs for l in logs['loss']]
+        super(MyProgBarLogger, self).on_train_batch_end(step, logs)
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        logs['loss'] = [l / self.train_bs for l in logs['loss']]
+        super(MyProgBarLogger, self).on_epoch_end(epoch, logs)
+
+    def on_eval_batch_end(self, step, logs=None):
+        logs = logs or {}
+        logs['loss'] = [l / self.eval_bs for l in logs['loss']]
+        super(MyProgBarLogger, self).on_eval_batch_end(step, logs)
+
+    def on_eval_end(self, logs=None):
+        logs = logs or {}
+        logs['loss'] = [l / self.eval_bs for l in logs['loss']]
+        super(MyProgBarLogger, self).on_eval_end(logs)
+
+
+def index2word(ids):
+    # Character ids are offsets into printable ASCII, starting at '!'
+    # (chr(33)).
+    return [chr(int(k + 33)) for k in ids]
+
+
+def postprocess(seq, bos_idx=0, eos_idx=1):
+    # Truncate at the first <eos>, then drop <bos>/<eos> tokens.
+    if type(seq) is np.ndarray:
+        seq = seq.tolist()
+    eos_pos = len(seq) - 1
+    for i, idx in enumerate(seq):
+        if idx == eos_idx:
+            eos_pos = i
+            break
+    seq = [
+        idx for idx in seq[:eos_pos + 1] if idx != bos_idx and idx != eos_idx
+    ]
+    return seq
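+
+# Example (illustrative, with the default bos_idx=0 and eos_idx=1):
+#   postprocess([0, 70, 68, 1, 5]) -> [70, 68]
+#   index2word([70, 68])           -> ['g', 'e']   # chr(70 + 33), chr(68 + 33)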
+
+
+class SeqBeamAccuracy(Metric):
+    def __init__(self, name=None, *args, **kwargs):
+        super(SeqBeamAccuracy, self).__init__(*args, **kwargs)
+        self._name = 'seq_acc'
+        self.reset()
+
+    def add_metric_op(self, output, label, mask, *args, **kwargs):
+        return output, label, mask
+
+    def update(self, preds, labels, masks, *args, **kwargs):
+        preds = preds[:, :, np.newaxis] if len(preds.shape) == 2 else preds
+        preds = np.transpose(preds, [0, 2, 1])
+        acc = 0
+        for i in range(labels.shape[0]):
+            # A sample counts as correct if any beam hypothesis matches the
+            # post-processed reference exactly.
+            ref = np.array(postprocess(labels[i]))
+            pred = preds[i]
+            for beam in pred:
+                beam_pred = np.array(postprocess(beam))
+                if np.array_equal(beam_pred, ref):
+                    self.total += 1
+                    acc += 1
+                    break
+            self.count += 1
+        return float(acc) / labels.shape[0]
+
+    def reset(self):
+        self.total = 0.
+        self.count = 0.
+
+    def accumulate(self):
+        return float(self.total) / self.count
+
+    def name(self):
+        return self._name
diff --git a/hapi/callbacks.py b/hapi/callbacks.py
index 7d46183..f02eec1 100644
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
@@ -218,8 +218,6 @@ class ProgBarLogger(Callback):
         # if steps is not None, last step will update in on_epoch_end
         if self.steps and self.train_step < self.steps:
             self._updates(logs, 'train')
-        else:
-            self._updates(logs, 'train')
 
     def on_epoch_end(self, epoch, logs=None):
         logs = logs or {}
@@ -238,7 +236,7 @@ class ProgBarLogger(Callback):
 
     def on_eval_batch_end(self, step, logs=None):
         logs = logs or {}
-        self.eval_step = step
+        self.eval_step += 1
         samples = logs.get('batch_size', 1)
         self.evaled_samples += samples
diff --git a/hapi/datasets/folder.py b/hapi/datasets/folder.py
index 5c728a6..23f2c95 100644
--- a/hapi/datasets/folder.py
+++ b/hapi/datasets/folder.py
@@ -18,7 +18,7 @@ import cv2
 
 from paddle.io import Dataset
 
-__all__ = ["DatasetFolder"]
+__all__ = ["DatasetFolder", "ImageFolder"]
 
 
 def has_valid_extension(filename, extensions):
@@ -164,3 +164,80 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
 
 def cv2_loader(path):
     return cv2.imread(path)
+
+
+class ImageFolder(Dataset):
+    """A generic data loader where the samples are arranged in this way:
+
+        root/1.ext
+        root/2.ext
+        root/sub_dir/3.ext
+
+    Args:
+        root (string): Root directory path.
+        loader (callable, optional): A function to load a sample given its path.
+        extensions (tuple[string], optional): A tuple of allowed extensions;
+            extensions and is_valid_file should not both be passed.
+        transform (callable, optional): A function/transform that takes in
+            a sample and returns a transformed version.
+        is_valid_file (callable, optional): A function that takes the path of
+            a file and checks whether it is a valid file (used to filter out
+            corrupt files); extensions and is_valid_file should not both be
+            passed.
+
+    Attributes:
+        samples (list): List of sample paths.
+    """
+
+    def __init__(self,
+                 root,
+                 loader=None,
+                 extensions=None,
+                 transform=None,
+                 is_valid_file=None):
+        self.root = root
+        if extensions is not None and is_valid_file is not None:
+            raise ValueError(
+                "extensions and is_valid_file cannot be passed at the same time")
+        if extensions is None and is_valid_file is None:
+            extensions = IMG_EXTENSIONS
+
+        samples = []
+        path = os.path.expanduser(root)
+        if extensions is not None:
+
+            def is_valid_file(x):
+                return has_valid_extension(x, extensions)
+
+        for root, _, fnames in sorted(os.walk(path, followlinks=True)):
+            for fname in sorted(fnames):
+                f = os.path.join(root, fname)
+                if is_valid_file(f):
+                    samples.append(f)
+
+        if len(samples) == 0:
+            raise RuntimeError(
+                "Found 0 files in subfolders of: " + self.root + "\n"
+                "Supported extensions are: " + ",".join(extensions))
+
+        self.loader = cv2_loader if loader is None else loader
+        self.extensions = extensions
+        self.samples = samples
+        self.transform = transform
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+
+        Returns:
+            list: [sample], where sample is the loaded (and optionally
+            transformed) image; this dataset carries no labels.
+        """
+        path = self.samples[index]
+        sample = self.loader(path)
+        if self.transform is not None:
+            sample = self.transform(sample)
+        return [sample]
+
+    def __len__(self):
+        return len(self.samples)
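A minimal usage sketch for the new `ImageFolder` (illustrative only; the `images/` path and the printed shapes are assumptions, not part of this patch):

```
from hapi.datasets.folder import ImageFolder

# Load every .jpg under images/ (recursively); cv2_loader is the default,
# so each sample is an ndarray in HWC/BGR layout.
dataset = ImageFolder('images/', extensions=('.jpg',))
print(len(dataset))    # number of files found
sample = dataset[0]    # -> [ndarray]; ImageFolder yields no labels
print(sample[0].shape)
```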
+ """ + path = self.samples[index] + sample = self.loader(path) + if self.transform is not None: + sample = self.transform(sample) + return [sample] + + def __len__(self): + return len(self.samples) diff --git a/hapi/model.py b/hapi/model.py index f8752cf..effae5c 100644 --- a/hapi/model.py +++ b/hapi/model.py @@ -1161,7 +1161,7 @@ class Model(fluid.dygraph.Layer): if fluid.in_dygraph_mode(): feed_list = None else: - feed_list = [x.forward() for x in self._inputs + self._labels] + feed_list = [x.forward() for x in self._inputs] if test_data is not None and isinstance(test_data, Dataset): test_sampler = DistributedBatchSampler( @@ -1236,10 +1236,10 @@ class Model(fluid.dygraph.Layer): callbacks.on_batch_begin(mode, step, logs) if mode == 'train': outs = self.train_batch(data[:len(self._inputs)], - data[len(self._inputs):]) + data[len(self._inputs):]) else: outs = self.eval_batch(data[:len(self._inputs)], - data[len(self._inputs):]) + data[len(self._inputs):]) # losses loss = outs[0] if self._metrics else outs -- GitLab