add create_dataset.py

ecd335c6 · xiaohang · 1b95b898 · ecd335c6 · ecd335c6
隐藏空白更改
内联并排

Showing with 86 addition and 0 deletion

tool/README.md tool/README.md +6 -0

tool/create_dataset.py tool/create_dataset.py +80 -0

未找到文件。
--- a/tool/README.md
+++ b/tool/README.md
+### Train on vgg recognition txt
+- download mjsynth.tar.gz and unzip to current folder
+- copy annotation_train.txt, annotation_test.txt and annotation_val.txt to the current folder
+- correct path info
+- create the label list from the image list (one lowercase label per image path): cat annotation_train.imgs | awk -F / '{print $NF}' | awk -F _ '{print $2}' | tr '[:upper:]' '[:lower:]' > annotation_train.labels
+- python create_dataset.py
--- a/tool/create_dataset.py
+++ b/tool/create_dataset.py
+import os
+import lmdb # install lmdb by "pip install lmdb"
+import cv2
+import numpy as np
+
+
+def checkImageIsValid(imageBin):
+    """
+    Check whether an encoded image buffer decodes to a non-empty image.
+
+    ARGS:
+        imageBin : raw encoded image bytes (content of an image file), or None
+
+    RETURNS:
+        True if OpenCV decodes the buffer to an image with non-zero area;
+        False for None/empty input, undecodable data or any decoding error.
+    """
+    if imageBin is None or len(imageBin) == 0:
+        return False
+    try:
+        # np.frombuffer replaces np.fromstring, whose binary mode is deprecated.
+        imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
+        img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
+    except Exception:
+        # Best effort: one corrupt file must never abort dataset creation.
+        return False
+    # cv2.imdecode yields None when the data is not a decodable image.
+    if img is None:
+        return False
+    imgH, imgW = img.shape[0], img.shape[1]
+    return imgH * imgW != 0
+
+
+def writeCache(env, cache):
+    """
+    Write every key/value pair of `cache` to LMDB within one write transaction.
+
+    ARGS:
+        env   : opened LMDB environment
+        cache : dict of entries to store; text keys/values are UTF-8 encoded,
+                bytes keys/values are stored unchanged
+    """
+    with env.begin(write=True) as txn:
+        # items() instead of the Python-2-only iteritems().
+        for k, v in cache.items():
+            # LMDB stores raw bytes; encode text so this also works on Python 3.
+            if not isinstance(k, bytes):
+                k = k.encode('utf-8')
+            if not isinstance(v, bytes):
+                v = v.encode('utf-8')
+            txn.put(k, v)
+
+
+def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True,
+                  mapSize=1099511627776):
+    """
+    Create LMDB dataset for CRNN training.
+
+    Samples are stored under the keys 'image-%09d', 'label-%09d' and, when a
+    lexicon is given, 'lexicon-%09d', numbered from 1 in the order they are
+    accepted. The number of stored samples is written under 'num-samples'.
+    Missing files and (when checkValid is true) invalid images are reported
+    on stdout and skipped.
+
+    ARGS:
+        outputPath    : LMDB output path
+        imagePathList : list of image path
+        labelList     : list of corresponding groundtruth texts
+        lexiconList   : (optional) list of lexicon lists
+        checkValid    : if true, check the validity of every image
+        mapSize       : (optional) maximum size of the LMDB map in bytes,
+                        defaults to 1 TiB
+    """
+    assert(len(imagePathList) == len(labelList))
+    nSamples = len(imagePathList)
+    env = lmdb.open(outputPath, map_size=mapSize)
+    try:
+        cache = {}
+        cnt = 1
+        for i, imagePath in enumerate(imagePathList):
+            label = labelList[i]
+            if not os.path.exists(imagePath):
+                print('%s does not exist' % imagePath)
+                continue
+            # Images are binary data: text mode would corrupt or fail to read them.
+            with open(imagePath, 'rb') as f:
+                imageBin = f.read()
+            if checkValid:
+                if len(imageBin) == 0 or (not checkImageIsValid(imageBin)):
+                    print('%s is not a valid image' % imagePath)
+                    continue
+
+            imageKey = 'image-%09d' % cnt
+            labelKey = 'label-%09d' % cnt
+            cache[imageKey] = imageBin
+            cache[labelKey] = label
+            if lexiconList:
+                lexiconKey = 'lexicon-%09d' % cnt
+                cache[lexiconKey] = ' '.join(lexiconList[i])
+            # Flush every 1000 accepted samples to bound memory use.
+            if cnt % 1000 == 0:
+                writeCache(env, cache)
+                cache = {}
+                print('Written %d / %d' % (cnt, nSamples))
+            cnt += 1
+        # cnt is one past the last accepted sample.
+        nSamples = cnt-1
+        cache['num-samples'] = str(nSamples)
+        writeCache(env, cache)
+    finally:
+        # Release the environment even if reading or writing fails.
+        env.close()
+    print('Created dataset with %d samples' % nSamples)
+
+
+if __name__ == '__main__':
+    # splitlines() avoids the empty trailing entry that split('\n') yields for
+    # a file ending in a newline; 'with' closes the list files after reading.
+    with open('annotation_train.imgs') as f:
+        imagePathList = f.read().splitlines()
+    with open('annotation_train.labels') as f:
+        labelList = f.read().splitlines()
+    outputPath = 'data/train_lmdb'
+    createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True)