Commit 773cbfb3 authored by breezedeus

add shorter mode to pool images to 1/8 width

Parent 3f3caf6e
@@ -7,7 +7,15 @@ from .__version__ import __version__
 # e.g. for any __version__ = '1.2.*', the corresponding MODEL_VERSION is '1.2.0'
 MODEL_VERSION = '.'.join(__version__.split('.', maxsplit=2)[:2]) + '.0'
-EMB_MODEL_TYPES = ['conv', 'conv-lite', 'densenet', 'densenet-lite']
+EMB_MODEL_TYPES = [
+    'conv',  # seq_len == 35, deprecated
+    'conv-lite',  # seq_len == 69
+    'conv-lite-s',  # seq_len == 35
+    'densenet',  # seq_len == 70, deprecated
+    'densenet-lite',  # seq_len == 70
+    'densenet-s',  # seq_len == 35
+    'densenet-lite-s',  # seq_len == 35
+]
 SEQ_MODEL_TYPES = ['lstm', 'gru', 'fc']

 root_url = (
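The new `-s` ("shorter") embedding variants pool the image width down to 1/8 instead of 1/4, halving the output sequence length (see the seq_len comments above). A minimal sketch of the naming rule, not part of this commit, assuming full model names are built as '<emb_model_type>-<seq_model_type>' (e.g. 'densenet-lite-s-fc') — which would explain why gen_network below matches prefixes with a trailing hyphen such as 'densenet-s-':

    def is_shorter(model_name: str) -> bool:
        # the trailing '-' keeps 'densenet-s-*' from also matching
        # names that merely start with 'densenet-'
        return model_name.startswith(
            ('conv-lite-s-', 'densenet-s-', 'densenet-lite-s-')
        )

    assert is_shorter('densenet-lite-s-fc')
    assert not is_shorter('densenet-lite-fc')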
@@ -35,19 +35,25 @@ def gen_network(model_name, hp):
     model_name = model_name.lower()
     if model_name.startswith('densenet'):
         hp.seq_len_cmpr_ratio = 4
-        hp.set_seq_length(hp.img_width // 4)
         layer_channels = (
             (32, 64, 128, 256)
             if model_name.startswith('densenet-lite')
             else (64, 128, 256, 512)
         )
-        densenet = DenseNet(layer_channels)
+        shorter = model_name.startswith('densenet-s-') or model_name.startswith(
+            'densenet-lite-s-'
+        )
+        seq_len = hp.img_width // 8 if shorter else hp.img_width // 4
+        hp.set_seq_length(seq_len)
+        densenet = DenseNet(layer_channels, shorter=shorter)
         densenet.hybridize()
         model = CRnn(hp, densenet)
     elif model_name.startswith('conv-lite'):
         hp.seq_len_cmpr_ratio = 4
-        hp.set_seq_length(hp.img_width // 4 - 1)
-        model = lambda data: crnn_lstm_lite(hp, data)
+        shorter = model_name.startswith('conv-lite-s-')
+        seq_len = hp.img_width // 8 if shorter else hp.img_width // 4 - 1
+        hp.set_seq_length(seq_len)
+        model = lambda data: crnn_lstm_lite(hp, data, shorter=shorter)
     elif model_name.startswith('conv'):
         hp.seq_len_cmpr_ratio = 8
         hp.set_seq_length(hp.img_width // 8)
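For the default img_width of 280 (the width also used in the tests below), the branches above yield the following sequence lengths; a quick arithmetic check, not part of this commit:

    width = 280
    # densenet / densenet-lite:      width // 4     == 70
    # densenet-s / densenet-lite-s:  width // 8     == 35
    # conv-lite:                     width // 4 - 1 == 69  (its pool-2 uses no padding)
    # conv-lite-s / conv:            width // 8     == 35
    assert width // 4 == 70 and width // 4 - 1 == 69 and width // 8 == 35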
@@ -254,7 +260,7 @@ def crnn_lstm(hp, data):
     return hidden_concat


-def crnn_lstm_lite(hp, data):
+def crnn_lstm_lite(hp, data, *, shorter=False):
     kernel_size = [(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3)]
     padding_size = [(1, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]
     layer_size = [min(32 * 2 ** (i + 1), 512) for i in range(len(kernel_size))]
@@ -289,9 +295,11 @@ def crnn_lstm_lite(hp, data):
     # print('4', net.infer_shape()[1])
     net = bottle_conv(4, net, kernel_size[4], layer_size[4], padding_size[4])
     net = bottle_conv(5, net, kernel_size[5], layer_size[5], padding_size[5], True) + x
-    # res: bz x 512 x 4 x 69; the length drops from 70 to 69 because the pooling uses no padding
+    width_stride = 2 if shorter else 1
+    # res: bz x 512 x 4 x 69 or bz x 512 x 4 x 35
+    # the length drops from 70 to 69 because the pooling uses no padding
     net = mx.symbol.Pooling(
-        data=net, name='pool-2', pool_type='max', kernel=(2, 2), stride=(2, 1)
+        data=net, name='pool-2', pool_type='max', kernel=(2, 2), stride=(2, width_stride)
     )
     # print('5', net.infer_shape()[1])
     # net = mx.symbol.Convolution(name='conv-%d' % 6, data=net, kernel=(4, 1), num_filter=layer_size[5])
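The width arithmetic behind those shape comments: a pooling layer with kernel k, stride s and no padding produces floor((in - k) / s) + 1 outputs. A quick check for the pool-2 layer above, not part of this commit:

    def pooled_width(in_w, kernel_w=2, stride_w=1):
        # output width of pooling with no padding
        return (in_w - kernel_w) // stride_w + 1

    assert pooled_width(70, stride_w=1) == 69  # default mode
    assert pooled_width(70, stride_w=2) == 35  # shorter mode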
@@ -72,29 +72,23 @@ def _make_residual(cell_net):
 class DenseNet(HybridBlock):
-    r"""Densenet-BC model from the
+    r"""DenseNet model adapted from Gluon's implementation
+    ("from gluoncv.model_zoo.densenet import DenseNet") of the
     `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_ paper.

     Parameters
     ----------
-    num_init_features : int
-        Number of filters to learn in the first convolution layer.
-    growth_rate : int
-        Number of filters to add each layer (`k` in the paper).
-    block_config : list of int
-        List of integers for numbers of layers in each pooling block.
-    bn_size : int, default 4
-        Multiplicative factor for number of bottle neck layers.
-        (i.e. bn_size * k features in the bottleneck layer)
-    dropout : float, default 0
-        Rate of dropout after each dense layer.
-    classes : int, default 1000
-        Number of classification classes.
+    layer_channels : tuple or list of int, with length 4,
+        such as `layer_channels = (64, 128, 256, 512)`
+    shorter : bool, default False
+        pool the width down to 1/8 of the input if True, else to 1/4
     """

-    def __init__(self, layer_channels, **kwargs):
+    def __init__(self, layer_channels, *, shorter=False, **kwargs):
         assert len(layer_channels) == 4
         super(DenseNet, self).__init__(**kwargs)
+        self.shorter = shorter
         with self.name_scope():
             # Stage 0
             self.features = nn.HybridSequential(prefix='')
@@ -123,7 +117,12 @@ class DenseNet(HybridBlock):
             self.features.add(_make_last_transition(layer_channels[3]))
             # Stage 3
-            self.features.add(_make_final_stage_net(3, out_channels=layer_channels[3]))
+            pool_size = strides = (2, 2) if self.shorter else (2, 1)
+            self.features.add(
+                _make_final_stage_net(
+                    3, pool_size, strides, out_channels=layer_channels[3]
+                )
+            )

             # num_features = num_init_features
             # for i, num_layers in enumerate(block_config):
@@ -147,8 +146,12 @@ class DenseNet(HybridBlock):
         :return: with shape (batch_size, embed_size, 1, img_width // 4)
         """
         x = self.features(x)  # res: (batch_size, embed_size, 2, img_width // 4)
-        x = F.reshape(x, (0, -3, 0))  # res: (batch_size, embed_size * 2, img_width // 4)
-        x = F.expand_dims(x, axis=2)  # res: (batch_size, embed_size * 2, 1, img_width // 4)
+        x = F.reshape(
+            x, (0, -3, 0)
+        )  # res: (batch_size, embed_size * 2, img_width // 4)
+        x = F.expand_dims(
+            x, axis=2
+        )  # res: (batch_size, embed_size * 2, 1, img_width // 4)
         return x
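The shape comments above describe the default mode; with shorter=True the final pooling halves the width again, so each img_width // 4 becomes img_width // 8. The reshape spec (0, -3, 0) folds the height-2 axis into the channel axis: in MXNet's reshape, 0 copies an input dimension and -3 merges the next two. A quick check, not part of this commit:

    import mxnet as mx

    x = mx.nd.zeros((128, 512, 2, 70))  # (batch, embed, 2, img_width // 4)
    y = x.reshape((0, -3, 0))           # 0: keep dim; -3: merge dims 1 and 2
    assert y.shape == (128, 1024, 70)   # (batch, embed * 2, img_width // 4)
    assert y.expand_dims(axis=2).shape == (128, 1024, 1, 70)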
@@ -212,7 +215,7 @@ def _make_last_transition(num_output_features):
     return out


-def _make_final_stage_net(stage_index, out_channels):
+def _make_final_stage_net(stage_index, pool_size, strides, out_channels):
     features = nn.HybridSequential(prefix='stage%d_' % stage_index)
     with features.name_scope():
         features.add(nn.BatchNorm())
@@ -225,5 +228,5 @@ def _make_final_stage_net(stage_index, out_channels):
         # )
         # features.add(nn.BatchNorm())
         # features.add(nn.Activation('relu'))
-        features.add(nn.MaxPool2D(pool_size=(2, 1), strides=(2, 1)))
+        features.add(nn.MaxPool2D(pool_size=pool_size, strides=strides))
     return features
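A minimal shape check for the parametrized pooling, not part of this commit (it exercises only the new MaxPool2D configuration; the full stage also contains BatchNorm and convolution layers):

    from mxnet import nd
    from mxnet.gluon import nn

    x = nd.zeros((1, 512, 4, 70))
    for shorter, expected_w in ((False, 70), (True, 35)):
        pool_size = strides = (2, 2) if shorter else (2, 1)
        pool = nn.MaxPool2D(pool_size=pool_size, strides=strides)
        assert pool(x).shape == (1, 512, 2, expected_w)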
@@ -37,15 +37,19 @@ def test_dense_layer():
 def test_densenet():
-    x = nd.random.randn(128, 64, 32, 280)
+    width = 280
+    x = nd.random.randn(128, 64, 32, width)
     layer_channels = (64, 128, 256, 512)
-    net = DenseNet(layer_channels)
-    net.initialize()
-    y = net(x)
-    logger.info(net)
-    logger.info(y.shape)  # (128, 512, 1, 70)
-    assert y.shape[2] == 1
-    logger.info('number of parameters: %d', cal_num_params(net))  # 1748224
+    for shorter in (False, True):
+        net = DenseNet(layer_channels, shorter=shorter)
+        net.initialize()
+        y = net(x)
+        logger.info(net)
+        logger.info(y.shape)  # (128, 512, 1, 70) or (128, 512, 1, 35)
+        assert y.shape[2] == 1
+        expected_seq_len = width // 8 if shorter else width // 4
+        assert y.shape[3] == expected_seq_len
+        logger.info('number of parameters: %d', cal_num_params(net))  # 1748224


 def test_crnn():
@@ -77,17 +81,19 @@ def test_crnn_lstm():
 def test_crnn_lstm_lite():
     hp = deepcopy(HP)
-    hp.set_seq_length(hp.img_width // 4 - 1)
-    data = mx.sym.Variable('data', shape=(128, 1, 32, 280))
-    pred = crnn_lstm_lite(HP, data)
-    pred_shape = pred.infer_shape()[1][0]
-    logger.info('shape of pred: %s', pred_shape)
-    assert pred_shape == (hp.seq_length, hp.batch_size, 2 * hp.num_hidden)
+    width = hp.img_width  # 280
+    data = mx.sym.Variable('data', shape=(128, 1, 32, width))
+    for shorter in (False, True):
+        pred = crnn_lstm_lite(HP, data, shorter=shorter)
+        pred_shape = pred.infer_shape()[1][0]
+        logger.info('shape of pred: %s', pred_shape)
+        seq_len = hp.img_width // 8 if shorter else hp.img_width // 4 - 1
+        assert pred_shape == (seq_len, hp.batch_size, 2 * hp.num_hidden)


 def test_pipline():
     hp = deepcopy(HP)
-    hp.set_seq_length(hp.img_width // 4 - 1)
+    hp.set_seq_length(hp.img_width // 4)
     hp._loss_type = None  # infer mode
     layer_channels_list = [(64, 128, 256, 512), (32, 64, 128, 256)]
     for layer_channels in layer_channels_list: