convert_ppocr_label.py 2.4 KB
Newer Older
L
LDOUBLEV 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
import numpy as np
import json
import os


def poly_to_string(poly):
    """Serialize polygon coordinates as one tab-separated string.

    Args:
        poly: Coordinates as a NumPy array or a (possibly nested) sequence
            of numbers, e.g. ``[[x1, y1], [x2, y2]]``.

    Returns:
        The coordinates in flattened (row-major) order, each converted with
        ``str`` and joined by tab characters. An empty input yields ``""``.
    """
    # np.asarray accepts plain lists as well as ndarrays, so callers are no
    # longer required to pass an object with a ``.shape`` attribute.
    # Flattening an already 1-D array leaves the values unchanged.
    flat = np.asarray(poly).flatten()
    return "\t".join(str(value) for value in flat)


def convert_label(label_dir, mode="gt", save_dir="./save_results/"):
    """Split a PPOCR label file into one text file per image.

    Every non-blank line of the label file holds an image path and a JSON
    list of annotations, separated by a tab (or, as a fallback, by four
    spaces). Each annotation is a dict with a ``'transcription'`` string, a
    ``'points'`` polygon and optionally a ``'score'``.

    For each image a file ``<save_dir>/<image file name>.txt`` is written
    with one line per kept annotation:

    * ``mode == "gt"``: ``<coords>\\t<ignore flag>\\t<text>`` where the flag
      is 1 when the transcription is ``"###"`` and 0 otherwise;
    * any other mode: ``<coords>\\t<text>``.

    Annotations carrying a ``'score'`` below 0.5 are dropped. If the same
    image path appears on several lines, the last one wins.

    Args:
        label_dir: Path of the label file to read (a file, despite the name).
        mode: ``"gt"`` to include the ignore flag; anything else omits it.
        save_dir: Directory receiving the per-image files; created if absent.

    Raises:
        ValueError: If ``label_dir`` does not exist, or a non-blank line
            cannot be split into an image path and an annotation payload.
        AssertionError: If ``label_dir`` and ``save_dir`` are the same path.
    """
    if not os.path.exists(label_dir):
        raise ValueError(f"The file {label_dir} does not exist!")

    assert label_dir != save_dir, \
        "label_dir and save_dir must not be the same path"

    gt_dict = {}

    # Explicit UTF-8 keeps the result independent of the platform's locale
    # encoding; the context manager guarantees the handle is closed.
    with open(label_dir, 'r', encoding='utf-8') as label_file:
        for line_no, line in enumerate(label_file, start=1):
            if not line.strip():
                # Blank lines carry no annotation; skip them.
                continue

            # Preferred layout is "<path>\t<json>"; otherwise fall back to
            # a four-space separator.
            tmp = line.split('\t')
            if len(tmp) != 2:
                tmp = line.strip().split('    ')
            if len(tmp) < 2:
                raise ValueError(
                    f"Line {line_no} of {label_dir} cannot be split into an "
                    f"image path and annotations: {line!r}")

            img_path = tmp[0]
            anno = json.loads(tmp[1])

            gt_lists = []
            for dic in anno:
                txt = dic['transcription']
                if 'score' in dic and float(dic['score']) < 0.5:
                    continue
                # Normalise ideographic (full-width) spaces to ASCII spaces;
                # ordinary spaces inside the text are kept as they are.
                txt = txt.replace(u'\u3000', u' ')
                coords = poly_to_string(np.array(dic['points']).flatten())

                if mode == "gt":
                    # "###" marks a region to be ignored (flag 1).
                    txt_tag = 1 if txt == "###" else 0
                    gt_label = coords + "\t" + str(txt_tag) + "\t" + txt + "\n"
                else:
                    gt_label = coords + "\t" + txt + "\n"

                gt_lists.append(gt_label)

            gt_dict[img_path] = gt_lists

    os.makedirs(save_dir, exist_ok=True)

    for img_name, labels in gt_dict.items():
        # Only the last path component names the output file.
        save_name = img_name.split("/")[-1]
        save_file = os.path.join(save_dir, save_name + ".txt")
        with open(save_file, "w", encoding='utf-8') as f:
            f.writelines(labels)

    print("The convert label saved in {}".format(save_dir))


if __name__ == "__main__":
    # Convert the ground-truth label file (output lines include the
    # ignore flag).
    ppocr_label_gt = "/paddle/Datasets/chinese/test_set/Label_refine_310_V2.txt"
    convert_label(ppocr_label_gt, "gt", "./save_gt_310_V2/")

    # Convert the prediction file (no ignore flag). The original call passed
    # an undefined name here, which raised NameError at runtime.
    ppocr_label_pred = "./infer_results/ch_PPOCRV2_infer.txt"
    convert_label(ppocr_label_pred, "pred", "./save_PPOCRV2_infer/")