diff --git a/20211008154929.png b/20211008154929.png deleted file mode 100644 index 7a1c234c03d4b44ea11fb7bdf6d61231cde9ec95..0000000000000000000000000000000000000000 Binary files a/20211008154929.png and /dev/null differ diff --git a/20211008155029.png b/20211008155029.png deleted file mode 100644 index 26fc052ebb3734c92846f35bff70d6f8e53e633e..0000000000000000000000000000000000000000 Binary files a/20211008155029.png and /dev/null differ diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 9e5b3245b0cfb56d300155a94f64d38edcdbb599..2b3cd5b77565c83173206199b2e13a04857d7239 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -204,6 +204,24 @@ For some data that are difficult to recognize, the recognition results will not pip install opencv-contrib-python-headless==4.2.0.32 ``` +### Dataset division + +- Enter the following command in the terminal to execute the dataset division script: + ``` + cd ./PPOCRLabel # Change the directory to the PPOCRLabel folder + python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec + ``` + +- Parameter Description: + + trainValTestRatio is the division ratio of the number of images in the training set, validation set, and test set, set according to your actual situation, the default is 6:2:2 + + labelRootPath is the storage path of the dataset labeled by PPOCRLabel, the default is ../train_data/label + + detRootPath is the path where the text detection dataset is divided according to the dataset marked by PPOCRLabel. The default is ../train_data/det + + recRootPath is the path where the character recognition dataset is divided according to the dataset marked by PPOCRLabel. The default is ../train_data/rec + ### Related 1.[Tzutalin. LabelImg. Git code (2015)](https://github.com/tzutalin/labelImg) \ No newline at end of file diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index 7f9351dfe185be2417162f2c786f5eec0b58816a..10baae5034354f77962c7d1fbe1b188dcae3573f 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -193,7 +193,22 @@ PPOCRLabel支持三种导出方式: ``` pip install opencv-contrib-python-headless==4.2.0.32 ``` +### 数据集划分 +- 在终端中输入以下命令执行数据集划分脚本: + ``` + cd ./PPOCRLabel # 将目录切换到PPOCRLabel文件夹下 + python gen_ocr_train_val_test.py --trainValTestRatio 6:2:2 --labelRootPath ../train_data/label --detRootPath ../train_data/det --recRootPath ../train_data/rec + ``` +- 参数说明: + + trainValTestRatio是训练集、验证集、测试集的图像数量划分比例,根据你的实际情况设定,默认是6:2:2 + + labelRootPath是PPOCRLabel标注的数据集存放路径,默认是../train_data/label + + detRootPath是根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径,默认是../train_data/det + recRootPath是根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径,默认是../train_data/rec + ### 参考资料 1.[Tzutalin. LabelImg. Git code (2015)](https://github.com/tzutalin/labelImg) diff --git a/gen_ocr_train_val.py b/PPOCRLabel/gen_ocr_train_val_test.py similarity index 69% rename from gen_ocr_train_val.py rename to PPOCRLabel/gen_ocr_train_val_test.py index 8b3388921eb64c626b1b3078cc272784018c12c1..64cba612ae267835dd47aedc2b0356c9df462038 100644 --- a/gen_ocr_train_val.py +++ b/PPOCRLabel/gen_ocr_train_val_test.py @@ -5,45 +5,61 @@ import random import argparse -# 删除划分的训练集和验证集文件夹,重新创建一个空的文件夹 +# 删除划分的训练集、验证集、测试集文件夹,重新创建一个空的文件夹 def isCreateOrDeleteFolder(path, flag): flagPath = os.path.join(path, flag) + if os.path.exists(flagPath): shutil.rmtree(flagPath) + os.makedirs(flagPath) flagAbsPath = os.path.abspath(flagPath) return flagAbsPath -def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, trainTxt, valTxt, flag): - # 按照指定的比例划分训练集和验证集 +def splitTrainVal(root, dir, absTrainRootPath, absValRootPath, absTestRootPath, trainTxt, valTxt, testTxt, flag): + # 按照指定的比例划分训练集、验证集、测试集 labelPath = os.path.join(root, dir) labelAbsPath = os.path.abspath(labelPath) + if flag == "det": labelFilePath = os.path.join(labelAbsPath, args.detLabelFileName) elif flag == "rec": labelFilePath = os.path.join(labelAbsPath, args.recLabelFileName) + labelFileRead = open(labelFilePath, "r", encoding="UTF-8") labelFileContent = labelFileRead.readlines() random.shuffle(labelFileContent) labelRecordLen = len(labelFileContent) + for index, labelRecordInfo in enumerate(labelFileContent): imageRelativePath = labelRecordInfo.split('\t')[0] imageLabel = labelRecordInfo.split('\t')[1] imageName = os.path.basename(imageRelativePath) + if flag == "det": imagePath = os.path.join(labelAbsPath, imageName) elif flag == "rec": imagePath = os.path.join(labelAbsPath, "{}\\{}".format(args.recImageDirName, imageName)) - # 小于划分比例trainValRatio时,数据集划分到训练集,否则测试集 - if index / labelRecordLen < args.trainValRatio: + + # 按预设的比例划分训练集、验证集、测试集 + trainValTestRatio = args.trainValTestRatio.split(":") + trainRatio = eval(trainValTestRatio[0]) / 10 + valRatio = trainRatio + eval(trainValTestRatio[1]) / 10 + curRatio = index / labelRecordLen + + if curRatio < trainRatio: imageCopyPath = os.path.join(absTrainRootPath, imageName) shutil.copy(imagePath, imageCopyPath) trainTxt.write("{}\t{}".format(imageCopyPath, imageLabel)) - else: + elif curRatio >= trainRatio and curRatio < valRatio: imageCopyPath = os.path.join(absValRootPath, imageName) shutil.copy(imagePath, imageCopyPath) valTxt.write("{}\t{}".format(imageCopyPath, imageLabel)) + else: + imageCopyPath = os.path.join(absTestRootPath, imageName) + shutil.copy(imagePath, imageCopyPath) + testTxt.write("{}\t{}".format(imageCopyPath, imageLabel)) # 删掉存在的文件 @@ -55,48 +71,59 @@ def removeFile(path): def genDetRecTrainVal(args): detAbsTrainRootPath = isCreateOrDeleteFolder(args.detRootPath, "train") detAbsValRootPath = isCreateOrDeleteFolder(args.detRootPath, "val") + detAbsTestRootPath = isCreateOrDeleteFolder(args.detRootPath, "test") recAbsTrainRootPath = isCreateOrDeleteFolder(args.recRootPath, "train") recAbsValRootPath = isCreateOrDeleteFolder(args.recRootPath, "val") + recAbsTestRootPath = isCreateOrDeleteFolder(args.recRootPath, "test") + removeFile(os.path.join(args.detRootPath, "train.txt")) removeFile(os.path.join(args.detRootPath, "val.txt")) + removeFile(os.path.join(args.detRootPath, "test.txt")) removeFile(os.path.join(args.recRootPath, "train.txt")) removeFile(os.path.join(args.recRootPath, "val.txt")) + removeFile(os.path.join(args.recRootPath, "test.txt")) + detTrainTxt = open(os.path.join(args.detRootPath, "train.txt"), "a", encoding="UTF-8") detValTxt = open(os.path.join(args.detRootPath, "val.txt"), "a", encoding="UTF-8") + detTestTxt = open(os.path.join(args.detRootPath, "test.txt"), "a", encoding="UTF-8") recTrainTxt = open(os.path.join(args.recRootPath, "train.txt"), "a", encoding="UTF-8") recValTxt = open(os.path.join(args.recRootPath, "val.txt"), "a", encoding="UTF-8") + recTestTxt = open(os.path.join(args.recRootPath, "test.txt"), "a", encoding="UTF-8") + for root, dirs, files in os.walk(args.labelRootPath): for dir in dirs: - splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detTrainTxt, detValTxt, "det") - splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recTrainTxt, recValTxt, "rec") + splitTrainVal(root, dir, detAbsTrainRootPath, detAbsValRootPath, detAbsTestRootPath, detTrainTxt, detValTxt, + detTestTxt, "det") + splitTrainVal(root, dir, recAbsTrainRootPath, recAbsValRootPath, recAbsTestRootPath, recTrainTxt, recValTxt, + recTestTxt, "rec") break if __name__ == "__main__": - # 功能描述:分别划分检测和识别的训练集和验证集 + # 功能描述:分别划分检测和识别的训练集、验证集、测试集 # 说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注, - # 如此会有多个标注好的图像文件夹汇总并划分训练集和验证集的需求 + # 如此会有多个标注好的图像文件夹汇总并划分训练集、验证集、测试集的需求 parser = argparse.ArgumentParser() parser.add_argument( - "--trainValRatio", - type=float, - default=0.8, - help="ratio of training set to validation set") + "--trainValTestRatio", + type=str, + default="6:2:2", + help="ratio of trainset:valset:testset") parser.add_argument( "--labelRootPath", type=str, - default="./train_data/label", + default="../train_data/label", help="path to the dataset marked by ppocrlabel, E.g, dataset folder named 1,2,3..." ) parser.add_argument( "--detRootPath", type=str, - default="./train_data/det", + default="../train_data/det", help="the path where the divided detection dataset is placed") parser.add_argument( "--recRootPath", type=str, - default="./train_data/rec", + default="../train_data/rec", help="the path where the divided recognition dataset is placed" ) parser.add_argument( diff --git a/gen_ocr_train_val.bat b/gen_ocr_train_val.bat deleted file mode 100644 index c1fb476fb8f465b6120548094778b32f7dc678ee..0000000000000000000000000000000000000000 --- a/gen_ocr_train_val.bat +++ /dev/null @@ -1 +0,0 @@ -python gen_ocr_train_val.py --trainValRatio 0.8 --labelRootPath ./train_data/label --detRootPath ./train_data/det --recRootPath ./train_data/rec \ No newline at end of file diff --git "a/gen_ocr_train_val.py\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/gen_ocr_train_val.py\344\275\277\347\224\250\350\257\264\346\230\216.md" deleted file mode 100644 index e638847c68d6f05336800453141620d3c5e58eca..0000000000000000000000000000000000000000 --- "a/gen_ocr_train_val.py\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ /dev/null @@ -1,32 +0,0 @@ -1、功能描述:分别划分检测和识别的训练集和验证集 - -2、说明:可以根据自己的路径和需求调整参数,图像数据往往多人合作分批标注,每一批图像数据放在一个文件夹内用PPOCRLabel进行标注,如此会有多个标注好的图像文件夹汇总并划分训练集和验证集的需求。 - -3、使用方法: - -3.1 首先使用PPOCRLabel标注好图像,一般是分批次标注,多个标注好的图像文件夹存放在train_data目录下的label文件夹里,文件夹没有自己创建,label同级路径下创建det文件夹存放划分好的文本检测数据集,label同级路径下创建rec文件夹存放划分好的字符识别数据集,目录结构如下图所示: - -![20211008154929](20211008154929.png) -![20211008155029](20211008155029.png) - -3.2 gen_ocr_train_val.py参数说明 - -trainValRatio 训练集和验证集的图像数量划分比例,根据你的实际情况设定,默认是0.8 - -labelRootPath PPOCRLabel标注的数据集存放路径,默认是./train_data/label - -detRootPath 根据PPOCRLabel标注的数据集划分后的文本检测数据集存放的路径 - -recRootPath 根据PPOCRLabel标注的数据集划分后的字符识别数据集存放的路径 - -detLabelFileName 使用PPOCRLabel标注图像时,人工确认过的标注结果会存放在Label.txt内 - -recLabelFileName 使用PPOCRLabel标注图像时,点击导出识别结果后,会对人工确认过的字符标注结果进行字符裁剪,生成裁剪后的字符图像路径以及字符图像对应的字符标签保存到rec_gt.txt中 - -recImageDirName 使用PPOCRLabel标注图像时,点击导出识别结果后,会把裁剪后的字符图像保存到crop_img文件夹内 - -3.3 执行gen_ocr_train_val.py方法 -如果目录结构和文件夹名称是严格按照以上说明创建的,可以直接在windows环境下执行gen_ocr_train_val.bat,在linux环境下需要执行gen_ocr_train_val.sh,默认划分比例是0.8 -也可以在终端中输入以下命令执行: -python gen_ocr_train_val.py --trainValRatio 0.8 --labelRootPath ./train_data/label --detRootPath ./train_data/det --recRootPath ./train_data/rec -如果想创建自己的目录结构和文件夹名称,需要手动修改命令里的路径 diff --git a/gen_ocr_train_val.sh b/gen_ocr_train_val.sh deleted file mode 100644 index efd66dce07e5204aba489d8767a637058fe8b295..0000000000000000000000000000000000000000 --- a/gen_ocr_train_val.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -python gen_ocr_train_val.py --trainValRatio 0.8 --labelRootPath ./train_data/label --detRootPath ./train_data/det --recRootPath ./train_data/rec \ No newline at end of file