diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 9c6ce120974701b372fb091fcd40038f790444d3..e40d82916ed30f39641e88e73c6563cc4cd0183b 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -198,21 +198,31 @@ For some data that are difficult to recognize, the recognition results will not - Enter the following command in the terminal to execute the dataset division script: - ``` + ``` cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder - python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec + python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` Parameter Description: - `trainValTestRatio` is the division ratio of the number of images in the training set, validation set, and test set, set according to your actual situation, the default is `6:2:2` - - `labelRootPath` is the storage path of the dataset labeled by PPOCRLabel, the default is `../train_data/label` - - - `detRootPath` is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/det` - - - `recRootPath` is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is `../train_data/rec` - + - `datasetRootPath` is the storage path of the complete dataset labeled by PPOCRLabel. The default path is `PaddleOCR/train_data` . + ``` + |-train_data + |-crop_img + |- word_001_crop_0.png + |- word_002_crop_0.jpg + |- word_003_crop_0.jpg + | ... + | Label.txt + | rec_gt.txt + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... + ``` + ### 3.6 Error message - If paddleocr is installed with whl, it has a higher priority than calling PaddleOCR class with paddleocr.py, which may cause an exception if whl package is not updated. diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 2226336631c68a892e3a7075b2dc8d65bccdf204..4f15e269b170a5d9431c905a160d889a3c27d5a9 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -185,19 +185,29 @@ PPOCRLabel支持三种导出方式: ``` cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下 -python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec +python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --datasetRootPath ../train_data ``` 参数说明: - `trainValTestRatio` 是训练集、验证集、测试集的图像数量划分比例,根据实际情况设定,默认是`6:2:2` -- `labelRootPath` 是PPOCRLabel标注的数据集存放路径,默认是`../train_data/label` - -- `detRootPath` 是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径,默认是`../train_data/det ` - -- `recRootPath` 是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径,默认是`../train_data/rec` - +- `datasetRootPath` 是PPOCRLabel标注的完整数据集存放路径。默认路径是 `PaddleOCR/train_data` 分割数据集前应有如下结构: + ``` + |-train_data + |-crop_img + |- word_001_crop_0.png + |- word_002_crop_0.jpg + |- word_003_crop_0.jpg + | ... + | Label.txt + | rec_gt.txt + |- word_001.png + |- word_002.jpg + |- word_003.jpg + | ... + ``` + ### 3.6 错误提示 - 如果同时使用whl包安装了paddleocr,其优先级大于通过paddleocr.py调用PaddleOCR类,whl包未更新时会导致程序异常。 diff --git a/PPOCRLabel/gen_ocr_train_val_test.py b/PPOCRLabel/gen_ocr_train_val_test.py index 64cba612ae267835dd47aedc2b0356c9df462038..03ae566c6ec64d7ade229fb9571b0cd89ec189d4 100644 --- a/PPOCRLabel/gen_ocr_train_val_test.py +++ b/PPOCRLabel/gen_ocr_train_val_test.py @@ -17,15 +17,14 @@ def isCreateOrDeleteFolder(path, flag): return flagAbsPath -def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag): +def splitTrainVal(root, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag): # 按照指定的比例划分训练集、验证集、测试集 - labelPath = os.path.join(root, dir) - labelAbsPath = os.path.abspath(labelPath) + dataAbsPath = os.path.abspath(root) if flag == "det": - labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName) + labelFilePath = os.path.join(dataAbsPath, args.detLabelFileName) elif flag == "rec": - labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName) + labelFilePath = os.path.join(dataAbsPath, args.recLabelFileName) labelFileRead = open(labelFilePath, "r", encoding="UTF-8") labelFileContent = labelFileRead.readlines() @@ -38,9 +37,9 @@ def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, imageName = os.path.basename(imageRelativePath) if flag == "det": - imagePath = os.path.join(labelAbsPath, imageName) + imagePath = os.path.join(dataAbsPath, imageName) elif flag == "rec": - imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName)) + imagePath = os.path.join(dataAbsPath, "{}\\{}".format(args.recImageDirName, imageName)) # 按预设的比例划分训练集、验证集、测试集 trainValTestRatio = args.trainValTestRatio.split(":") @@ -90,15 +89,20 @@ def genDetRecTrainVal(args): recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8") recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8") - for root, dirs, files in os.walk(args.labelRootPath): + splitTrainVal(args.datasetRootPath, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt, + detTestTxt, "det") + + for root, dirs, files in os.walk(args.datasetRootPath): for dir in dirs: - splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt, - detTestTxt, "det") - splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt, - recTestTxt, "rec") + if dir == 'crop_img': + splitTrainVal(root, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt, + recTestTxt, "rec") + else: + continue break + if __name__ == "__main__": # 功能描述:分别划分检测和识别的训练集、验证集、测试集 # 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注, @@ -110,9 +114,9 @@ if __name__ == "__main__": default="6:2:2", help="ratio of trainset:valset:testset") parser.add_argument( - "--labelRootPath", + "--datasetRootPath", type=str, - default="../train_data/label", + default="../train_data/", help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..." ) parser.add_argument(