# voc2coco.py

# convert VOC xml to COCO format json
import xml.etree.ElementTree as ET
import os
import json
import argparse


# create and init coco json, img set, and class set
def init_json():
    """Build the empty COCO skeleton plus the lookup structures used while converting.

    Returns:
        A 3-tuple ``(coco, class_set, image_set)``:
        * ``coco`` -- dict with empty ``images`` / ``annotations`` lists,
          ``type`` set to ``'instances'`` and one ``categories`` entry per
          VOC class (ids are the 0-based position in the class list).
        * ``class_set`` -- dict mapping each class name to its category id.
        * ``image_set`` -- empty set, meant to collect file names already seen.
    """
    # the 20 PASCAL VOC object classes, in the order that defines their ids
    label_names = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
        'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )

    # one category record per class
    categories = [
        {'supercategory': 'none', 'id': index, 'name': label}
        for index, label in enumerate(label_names)
    ]
    # name -> id lookup derived from the very same records
    name_to_id = {entry['name']: entry['id'] for entry in categories}

    skeleton = {
        'images': [],
        'type': 'instances',
        'annotations': [],
        'categories': categories,
    }
    return skeleton, name_to_id, set()


def getImgItem(file_name, size, img_id):
    """Assemble the COCO ``images`` record for one picture.

    Args:
        file_name: image file name read from the xml, or None if it was absent.
        size: mapping that holds the ``'width'`` and ``'height'`` values
            (None marks a value that was not found).
        img_id: id to store in the record.

    Returns:
        dict with the keys ``id``, ``file_name``, ``width`` and ``height``.

    Raises:
        Exception: when the file name, the width or the height is None.
    """
    if file_name is None:
        raise Exception('Could not find filename tag in xml file.')

    # dimensions are validated in this order, each with its own message
    dimension_checks = (
        ('width', 'Could not find width tag in xml file.'),
        ('height', 'Could not find height tag in xml file.'),
    )
    for key, message in dimension_checks:
        if size[key] is None:
            raise Exception(message)

    return {
        'id': img_id,
        'file_name': file_name,
        'width': size['width'],
        'height': size['height'],
    }


def getAnnoItem(object_name, image_id, ann_id, category_id, bbox):
    """Assemble the COCO ``annotations`` record for one bounding box.

    Args:
        object_name: class name of the object (not stored in the record).
        image_id: id of the image the box belongs to.
        ann_id: id to give this annotation.
        category_id: category id of the object.
        bbox: sequence laid out as ``x, y, w, h``; it is stored in the
            record as-is (same object, no copy).

    Returns:
        dict with ``segmentation`` (one 4-corner polygon derived from the
        box), ``area`` (w * h), ``iscrowd``, ``ignore``, ``image_id``,
        ``bbox``, ``category_id`` and ``id``.
    """
    left, top, box_w, box_h = bbox[0], bbox[1], bbox[2], bbox[3]
    right = left + box_w
    bottom = top + box_h

    # rectangle outline: left-top, left-bottom, right-bottom, right-top
    outline = [
        left, top,
        left, bottom,
        right, bottom,
        right, top,
    ]

    return {
        'segmentation': [outline],
        'area': box_w * box_h,
        'iscrowd': 0,
        'ignore': 0,
        'image_id': image_id,
        'bbox': bbox,
        'category_id': category_id,
        'id': ann_id,
    }


def _xml_name_from_line(img_line):
    """Return the annotation file name named by one non-empty list entry."""
    _, marker, remainder = img_line.partition('Annotations/')
    if marker:
        # text after the first 'Annotations/' occurrence
        return remainder
    # Bare image ids (e.g. 000005 for voc2007, 2008_000002 for voc2012) carry
    # no directory prefix; map them onto <id>.xml instead of failing.
    if img_line.endswith('.xml'):
        return img_line
    return img_line + '.xml'


def _read_image_size(root, xml_name):
    """Read the children of the <size> tag into a dict of ints."""
    size = {'width': None, 'height': None, 'depth': None}
    size_node = root.find('size')
    # Explicit checks instead of Element truth-testing (deprecated); a missing
    # tag and a tag without children are both treated as a broken structure.
    if size_node is None or len(size_node) == 0:
        raise Exception('xml {} structure broken at size tag.'.format(
            xml_name))
    for sub_node in size_node:
        size[sub_node.tag] = int(sub_node.text)
    return size


def _read_bbox(object_node, xml_name):
    """Read the <bndbox> of one <object> and return it as [x, y, w, h]."""
    corners = {'xmin': None, 'ymin': None, 'xmax': None, 'ymax': None}
    bbox_node = object_node.find('bndbox')
    if bbox_node is None:
        raise Exception('xml {} structure corrupted at bndbox tag.'.format(
            xml_name))
    for coordinate in bbox_node:
        # reject unknown tags as well as a coordinate that appears twice
        if coordinate.tag not in corners or corners[coordinate.tag] is not None:
            raise Exception('xml {} structure corrupted at bndbox tag.'.
                            format(xml_name))
        # coordinates may be written as floats, hence int(float(...))
        corners[coordinate.tag] = int(float(coordinate.text))
    if any(value is None for value in corners.values()):
        # a coordinate is missing, so width/height cannot be computed
        raise Exception('xml {} structure corrupted at bndbox tag.'.format(
            xml_name))
    return [
        corners['xmin'],
        corners['ymin'],
        corners['xmax'] - corners['xmin'],
        corners['ymax'] - corners['ymin'],
    ]


def convert_voc_to_coco(txt_path, json_path, xml_path):
    """Convert the VOC xml annotations listed in a text file to one COCO json.

    Every non-blank line of ``txt_path`` names one annotation file: either a
    path containing ``Annotations/`` (the part after it is used) or a bare
    image id, which is mapped to ``<id>.xml``. Files that do not exist under
    ``xml_path`` are reported and skipped. Blank lines are skipped rather
    than ending the conversion.

    Args:
        txt_path: path of the text file listing the annotations to convert.
        json_path: path of the COCO json file to write.
        xml_path: directory that contains the VOC xml files.

    Raises:
        Exception: if an xml root is not ``annotation``, the size or bndbox
            structure is broken, an image file name occurs twice, an object
            category is not a known VOC class, or the file name / width /
            height is missing.
    """
    # create and init coco json, img set, and class set
    coco_json, class_set, image_set = init_json()

    img_id = 0
    ann_id = 0
    # the context manager closes the list file even if a conversion error occurs
    with open(txt_path, 'r') as img_txt:
        for raw_line in img_txt:
            img_line = raw_line.strip()
            if not img_line:
                continue
            print('img_id:', img_id)

            # find corresponding xml
            xml_name = _xml_name_from_line(img_line)
            xml_file = os.path.join(xml_path, xml_name)
            if not os.path.exists(xml_file):
                print('{} is not exists.'.format(xml_name))
                continue

            # decode xml
            root = ET.parse(xml_file).getroot()
            if root.tag != 'annotation':
                raise Exception(
                    'xml {} root element should be annotation, rather than {}'.
                    format(xml_name, root.tag))

            # A missing <filename> becomes None so that getImgItem reports it
            # instead of an AttributeError being raised here.
            file_name_node = root.find('filename')
            file_name = None if file_name_node is None else file_name_node.text

            size = _read_image_size(root, xml_name)

            # add img into json; every image may appear only once
            if file_name in image_set:
                raise Exception(' xml {} duplicated image: {}'.format(
                    xml_name, file_name))
            img_id += 1
            image_item = getImgItem(file_name, size, img_id)
            image_set.add(file_name)
            coco_json['images'].append(image_item)

            # add one annotation per <object>
            for object_node in root.findall('object'):
                name_node = object_node.find('name')
                object_name = None if name_node is None else name_node.text
                if object_name not in class_set:
                    raise Exception('xml {} Unrecognized category: {}'.format(
                        xml_name, object_name))
                category_id = class_set[object_name]

                bbox = _read_bbox(object_node, xml_name)
                ann_id += 1
                coco_json['annotations'].append(
                    getAnnoItem(object_name, img_id, ann_id, category_id,
                                bbox))

    print('Saving json.')
    # close (and thereby flush) the output file deterministically
    with open(json_path, 'w') as json_file:
        json.dump(coco_json, json_file)


if __name__ == '__main__':
    # command line: which data split to convert and where the devkit lives
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--base_path',
        default='dataset/voc/VOCdevkit',
        type=str,
        help="base VOC path.")
    cli.add_argument(
        '--type', default='VOC2007_test', type=str, help="data type")
    options = cli.parse_args()
    data_type = options.type

    # the image list and the resulting json sit side by side
    pseudo_dir = os.path.join(options.base_path, 'PseudoAnnotations')
    txt_path = os.path.join(pseudo_dir, data_type + '.txt')
    json_path = os.path.join(pseudo_dir, data_type + '.json')

    # xml files live under the part of the type before the first underscore
    dataset_dir = data_type.split('_')[0]
    xml_path = os.path.join(options.base_path, dataset_dir, 'Annotations')

    for label, location in (('txt_path:', txt_path),
                            ('json_path:', json_path),
                            ('xml_path:', xml_path)):
        print(label, location)

    print('Converting {} to COCO json.'.format(data_type))
    convert_voc_to_coco(txt_path, json_path, xml_path)
    print('Finished.')