diff --git a/PaddleCV/yolov3/image_utils.py b/PaddleCV/yolov3/image_utils.py
index a35e20be71fe10cce2c4629cb64ea4c5e74cfe27..2e713525f3d04e5145c98709894d88df4adf4446 100644
--- a/PaddleCV/yolov3/image_utils.py
+++ b/PaddleCV/yolov3/image_utils.py
@@ -51,7 +51,14 @@ def random_distort(img):
 
     return img
 
-def random_crop(img, boxes, labels, scores, scales=[0.3, 1.0], max_ratio=2.0, constraints=None, max_trial=50):
+def random_crop(img,
+                boxes,
+                labels,
+                scores,
+                scales=[0.3, 1.0],
+                max_ratio=2.0,
+                constraints=None,
+                max_trial=50):
     if len(boxes) == 0:
         return img, boxes
 
@@ -90,10 +97,12 @@
 
     while crops:
         crop = crops.pop(np.random.randint(0, len(crops)))
-        crop_boxes, crop_labels, crop_scores, box_num = box_utils.box_crop(boxes, labels, scores, crop, (w, h))
+        crop_boxes, crop_labels, crop_scores, box_num = \
+            box_utils.box_crop(boxes, labels, scores, crop, (w, h))
         if box_num < 1:
             continue
-        img = img.crop((crop[0], crop[1], crop[0] + crop[2], crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
+        img = img.crop((crop[0], crop[1], crop[0] + crop[2],
+                        crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
         img = np.asarray(img)
         return img, crop_boxes, crop_labels, crop_scores
     img = np.asarray(img)
@@ -118,10 +127,16 @@
     h, w, _ = img.shape
     im_scale_x = size / float(w)
     im_scale_y = size / float(h)
-    img = cv2.resize(img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=interp)
+    img = cv2.resize(img, None, None, fx=im_scale_x, fy=im_scale_y,
+                     interpolation=interp)
     return img
 
-def random_expand(img, gtboxes, max_ratio=4., fill=None, keep_ratio=True, thresh=0.5):
+def random_expand(img,
+                  gtboxes,
+                  max_ratio=4.,
+                  fill=None,
+                  keep_ratio=True,
+                  thresh=0.5):
     if random.random() > thresh:
         return img, gtboxes
 
@@ -153,13 +168,21 @@ def random_expand(img, gtboxes, max_ratio=4., fill=None, keep_ratio=True, thresh
     return out_img.astype('uint8'), gtboxes
 
 def shuffle_gtbox(gtbox, gtlabel, gtscore):
-    gt = np.concatenate([gtbox, gtlabel[:, np.newaxis], gtscore[:, np.newaxis]], axis=1)
+    gt = np.concatenate([gtbox, gtlabel[:, np.newaxis],
+                         gtscore[:, np.newaxis]], axis=1)
     idx = np.arange(gt.shape[0])
     np.random.shuffle(idx)
     gt = gt[idx, :]
     return gt[:, :4], gt[:, 4], gt[:, 5]
 
-def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2, gtscores2):
+def image_mixup(img1,
+                gtboxes1,
+                gtlabels1,
+                gtscores1,
+                img2,
+                gtboxes2,
+                gtlabels2,
+                gtscores2):
     factor = np.random.beta(1.5, 1.5)
     factor = max(0.0, min(1.0, factor))
     if factor >= 1.0:
@@ -173,7 +196,8 @@
     w = max(img1.shape[1], img2.shape[1])
     img = np.zeros((h, w, img1.shape[2]), 'float32')
     img[:img1.shape[0], :img1.shape[1], :] = img1.astype('float32') * factor
-    img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1.0 - factor)
+    img[:img2.shape[0], :img2.shape[1], :] += \
+        img2.astype('float32') * (1.0 - factor)
     gtboxes = np.zeros_like(gtboxes1)
     gtlabels = np.zeros_like(gtlabels1)
     gtscores = np.zeros_like(gtscores1)
@@ -208,7 +232,8 @@ def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2,
 def image_augment(img, gtboxes, gtlabels, gtscores, size, means=None):
     img = random_distort(img)
     img, gtboxes = random_expand(img, gtboxes, fill=means)
-    img, gtboxes, gtlabels, gtscores = random_crop(img, gtboxes, gtlabels, gtscores)
+    img, gtboxes, gtlabels, gtscores = \
+        random_crop(img, gtboxes, gtlabels, gtscores)
     img = random_interp(img, size)
     img, gtboxes = random_flip(img, gtboxes)
     gtboxes, gtlabels, gtscores = shuffle_gtbox(gtboxes, gtlabels, gtscores)
diff --git a/PaddleCV/yolov3/models/darknet.py b/PaddleCV/yolov3/models/darknet.py
index 6fd89d425960afe1c2af1674d1ed37ca7fd47271..bfce6f3b69077745b1fe50836da37eabc9d30789 100644
--- a/PaddleCV/yolov3/models/darknet.py
+++ b/PaddleCV/yolov3/models/darknet.py
@@ -55,7 +55,13 @@ def conv_bn_layer(input,
         out = fluid.layers.leaky_relu(x=out, alpha=0.1)
     return out
 
-def downsample(input, ch_out, filter_size=3, stride=2, padding=1, is_test=True, name=None):
+def downsample(input,
+               ch_out,
+               filter_size=3,
+               stride=2,
+               padding=1,
+               is_test=True,
+               name=None):
     return conv_bn_layer(input,
                          ch_out=ch_out,
                          filter_size=filter_size,
@@ -65,15 +71,19 @@ def downsample(input, ch_out, filter_size=3, stride=2, padding=1, is_test=True,
                          name=name)
 
 def basicblock(input, ch_out, is_test=True, name=None):
-    conv1 = conv_bn_layer(input, ch_out, 1, 1, 0, is_test=is_test, name=name+".0")
-    conv2 = conv_bn_layer(conv1, ch_out*2, 3, 1, 1, is_test=is_test, name=name+".1")
+    conv1 = conv_bn_layer(input, ch_out, 1, 1, 0,
+                          is_test=is_test, name=name+".0")
+    conv2 = conv_bn_layer(conv1, ch_out*2, 3, 1, 1,
+                          is_test=is_test, name=name+".1")
     out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)
     return out
 
 def layer_warp(block_func, input, ch_out, count, is_test=True, name=None):
-    res_out = block_func(input, ch_out, is_test=is_test, name='{}.0'.format(name))
+    res_out = block_func(input, ch_out, is_test=is_test,
+                         name='{}.0'.format(name))
     for j in range(1, count):
-        res_out = block_func(res_out, ch_out, is_test=is_test, name='{}.{}'.format(name, j))
+        res_out = block_func(res_out, ch_out, is_test=is_test,
+                             name='{}.{}'.format(name, j))
     return res_out
 
 DarkNet_cfg = {
@@ -83,14 +93,21 @@
 def add_DarkNet53_conv_body(body_input, is_test=True):
     stages, block_func = DarkNet_cfg[53]
     stages = stages[0:5]
-    conv1 = conv_bn_layer(
-        body_input, ch_out=32, filter_size=3, stride=1, padding=1, is_test=is_test, name="yolo_input")
-    downsample_ = downsample(conv1, ch_out=conv1.shape[1]*2, is_test=is_test, name="yolo_input.downsample")
+    conv1 = conv_bn_layer(body_input, ch_out=32, filter_size=3,
+                          stride=1, padding=1, is_test=is_test,
+                          name="yolo_input")
+    downsample_ = downsample(conv1, ch_out=conv1.shape[1]*2,
+                             is_test=is_test,
+                             name="yolo_input.downsample")
     blocks = []
     for i, stage in enumerate(stages):
-        block = layer_warp(block_func, downsample_, 32 *(2**i), stage, is_test=is_test, name="stage.{}".format(i))
+        block = layer_warp(block_func, downsample_, 32 *(2**i),
+                           stage, is_test=is_test,
+                           name="stage.{}".format(i))
         blocks.append(block)
         if i < len(stages) - 1: # do not downsaple in the last stage
-            downsample_ = downsample(block, ch_out=block.shape[1]*2, is_test=is_test, name="stage.{}.downsample".format(i))
+            downsample_ = downsample(block, ch_out=block.shape[1]*2,
+                                     is_test=is_test,
+                                     name="stage.{}.downsample".format(i))
     return blocks[-1:-4:-1]
 
diff --git a/PaddleCV/yolov3/models/yolov3.py b/PaddleCV/yolov3/models/yolov3.py
index 55c0667b16097b4424c948798a5faccc8ad2e366..ef57abddf52811ea5cdc4ee4c54d3163accd86fb 100644
--- a/PaddleCV/yolov3/models/yolov3.py
+++ b/PaddleCV/yolov3/models/yolov3.py
@@ -27,13 +27,22 @@ from .darknet import add_DarkNet53_conv_body
 from .darknet import conv_bn_layer
 
 def yolo_detection_block(input, channel, is_test=True, name=None):
-    assert channel % 2 == 0, "channel {} cannot be divided by 2".format(channel)
+    assert channel % 2 == 0, \
+        "channel {} cannot be divided by 2".format(channel)
     conv = input
     for j in range(2):
-        conv = conv_bn_layer(conv, channel, filter_size=1, stride=1, padding=0, is_test=is_test, name='{}.{}.0'.format(name, j))
-        conv = conv_bn_layer(conv, channel*2, filter_size=3, stride=1, padding=1, is_test=is_test, name='{}.{}.1'.format(name, j))
-    route = conv_bn_layer(conv, channel, filter_size=1, stride=1, padding=0, is_test=is_test, name='{}.2'.format(name))
-    tip = conv_bn_layer(route,channel*2, filter_size=3, stride=1, padding=1, is_test=is_test, name='{}.tip'.format(name))
+        conv = conv_bn_layer(conv, channel, filter_size=1,
+                             stride=1, padding=0, is_test=is_test,
+                             name='{}.{}.0'.format(name, j))
+        conv = conv_bn_layer(conv, channel*2, filter_size=3,
+                             stride=1, padding=1, is_test=is_test,
+                             name='{}.{}.1'.format(name, j))
+    route = conv_bn_layer(conv, channel, filter_size=1, stride=1,
+                          padding=0, is_test=is_test,
+                          name='{}.2'.format(name))
+    tip = conv_bn_layer(route,channel*2, filter_size=3, stride=1,
+                        padding=1, is_test=is_test,
+                        name='{}.tip'.format(name))
     return route, tip
 
 def upsample(input, scale=2,name=None):
@@ -68,11 +77,15 @@ class YOLOv3(object):
         if self.is_train:
             self.py_reader = fluid.layers.py_reader(
                 capacity=64,
-                shapes = [[-1] + self.image_shape, [-1, cfg.max_box_num, 4], [-1, cfg.max_box_num], [-1, cfg.max_box_num]],
+                shapes = [[-1] + self.image_shape,
+                          [-1, cfg.max_box_num, 4],
+                          [-1, cfg.max_box_num],
+                          [-1, cfg.max_box_num]],
                 lod_levels=[0, 0, 0, 0],
                 dtypes=['float32'] * 2 + ['int32'] + ['float32'],
                 use_double_buffer=True)
-            self.image, self.gtbox, self.gtlabel, self.gtscore = fluid.layers.read_file(self.py_reader)
+            self.image, self.gtbox, self.gtlabel, self.gtscore = \
+                fluid.layers.read_file(self.py_reader)
         else:
             self.image = fluid.layers.data(
                 name='image', shape=self.image_shape, dtype='float32'
@@ -139,9 +152,9 @@
             if self.is_train:
                 loss = fluid.layers.yolov3_loss(
                     x=out,
-                    gtbox=self.gtbox,
-                    gtlabel=self.gtlabel,
-                    gtscore=self.gtscore,
+                    gt_box=self.gtbox,
+                    gt_label=self.gtlabel,
+                    gt_score=self.gtscore,
                     anchors=cfg.anchors,
                     anchor_mask=anchor_mask,
                     class_num=cfg.class_num,
diff --git a/PaddleCV/yolov3/reader.py b/PaddleCV/yolov3/reader.py
index 24e830dfc6487b90b266c70effe8984d5a661276..7d1f0de79a77da9a37e39988065751749d39f891 100644
--- a/PaddleCV/yolov3/reader.py
+++ b/PaddleCV/yolov3/reader.py
@@ -53,13 +53,17 @@
                 cfg.dataset))
 
         if mode == 'train':
-            cfg.train_file_list = os.path.join(cfg.data_dir, cfg.train_file_list)
-            cfg.train_data_dir = os.path.join(cfg.data_dir, cfg.train_data_dir)
+            cfg.train_file_list = os.path.join(cfg.data_dir,
+                                               cfg.train_file_list)
+            cfg.train_data_dir = os.path.join(cfg.data_dir,
+                                              cfg.train_data_dir)
             self.COCO = COCO(cfg.train_file_list)
             self.img_dir = cfg.train_data_dir
         elif mode == 'test' or mode == 'infer':
-            cfg.val_file_list = os.path.join(cfg.data_dir, cfg.val_file_list)
-            cfg.val_data_dir = os.path.join(cfg.data_dir, cfg.val_data_dir)
+            cfg.val_file_list = os.path.join(cfg.data_dir,
+                                             cfg.val_file_list)
+            cfg.val_data_dir = os.path.join(cfg.data_dir,
+                                            cfg.val_data_dir)
             self.COCO = COCO(cfg.val_file_list)
             self.img_dir = cfg.val_data_dir
 
@@ -88,7 +92,8 @@
     def _parse_gt_annotations(self, img):
         img_height = img['height']
         img_width = img['width']
-        anno = self.COCO.loadAnns(self.COCO.getAnnIds(imgIds=img['id'], iscrowd=None))
+        anno = self.COCO.loadAnns(
+            self.COCO.getAnnIds(imgIds=img['id'], iscrowd=None))
         gt_index = 0
         for target in anno:
             if target['area'] < cfg.gt_min_area:
@@ -96,13 +101,15 @@
                 continue
             if 'ignore' in target and target['ignore']:
                 continue
 
-            box = box_utils.coco_anno_box_to_center_relative(target['bbox'], img_height, img_width)
+            box = box_utils.coco_anno_box_to_center_relative(
+                target['bbox'], img_height, img_width)
             if box[2] <= 0 and box[3] <= 0:
                 continue
             img['gt_id'][gt_index] = np.int32(target['id'])
             img['gt_boxes'][gt_index] = box
-            img['gt_labels'][gt_index] = self.category_to_id_map[target['category_id']]
+            img['gt_labels'][gt_index] = \
+                self.category_to_id_map[target['category_id']]
             gt_index += 1
             if gt_index >= cfg.max_box_num:
                 break
@@ -136,10 +143,18 @@
         else:
             return self._parse_images(is_train=(mode=='train'))
 
-    def get_reader(self, mode, size=416, batch_size=None, shuffle=False, mixup_iter=0, random_sizes=[], image=None):
+    def get_reader(self,
+                   mode,
+                   size=416,
+                   batch_size=None,
+                   shuffle=False,
+                   mixup_iter=0,
+                   random_sizes=[],
+                   image=None):
         assert mode in ['train', 'test', 'infer'], "Unknow mode type!"
         if mode != 'infer':
-            assert batch_size is not None, "batch size connot be None in mode {}".format(mode)
+            assert batch_size is not None, \
+                "batch size cannot be None in mode {}".format(mode)
 
         self._parse_dataset_dir(mode)
         self._parse_dataset_catagory()
@@ -151,7 +166,9 @@
             h, w, _ = im.shape
             im_scale_x = size / float(w)
             im_scale_y = size / float(h)
-            out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC)
+            out_img = cv2.resize(im, None, None,
+                                 fx=im_scale_x, fy=im_scale_y,
+                                 interpolation=cv2.INTER_CUBIC)
             mean = np.array(mean).reshape((1, 1, -1))
             std = np.array(std).reshape((1, 1, -1))
             out_img = (out_img / 255.0 - mean) / std
@@ -173,11 +190,14 @@
                 mixup_gt_boxes = np.array(mixup_img['gt_boxes']).copy()
                 mixup_gt_labels = np.array(mixup_img['gt_labels']).copy()
                 mixup_gt_scores = np.ones_like(mixup_gt_labels)
-                im, gt_boxes, gt_labels, gt_scores = image_utils.image_mixup(im, gt_boxes, \
-                    gt_labels, gt_scores, mixup_im, mixup_gt_boxes, mixup_gt_labels, \
-                    mixup_gt_scores)
+                im, gt_boxes, gt_labels, gt_scores = \
+                    image_utils.image_mixup(im, gt_boxes, gt_labels,
+                                            gt_scores, mixup_im, mixup_gt_boxes,
+                                            mixup_gt_labels, mixup_gt_scores)
 
-            im, gt_boxes, gt_labels, gt_scores = image_utils.image_augment(im, gt_boxes, gt_labels, gt_scores, size, mean)
+            im, gt_boxes, gt_labels, gt_scores = \
+                image_utils.image_augment(im, gt_boxes, gt_labels,
+                                          gt_scores, size, mean)
 
             mean = np.array(mean).reshape((1, 1, -1))
             std = np.array(std).reshape((1, 1, -1))
@@ -214,7 +234,9 @@
                 read_cnt += 1
                 if read_cnt % len(imgs) == 0 and shuffle:
                     np.random.shuffle(imgs)
-                im, gt_boxes, gt_labels, gt_scores = img_reader_with_augment(img, img_size, cfg.pixel_means, cfg.pixel_stds, mixup_img)
+                im, gt_boxes, gt_labels, gt_scores = \
+                    img_reader_with_augment(img, img_size, cfg.pixel_means,
+                                            cfg.pixel_stds, mixup_img)
                 batch_out.append([im, gt_boxes, gt_labels, gt_scores])
 
                 if len(batch_out) == batch_size:
@@ -227,7 +249,9 @@
                 imgs = self._parse_images_by_mode(mode)
                 batch_out = []
                 for img in imgs:
-                    im, im_id, im_shape = img_reader(img, size, cfg.pixel_means, cfg.pixel_stds)
+                    im, im_id, im_shape = img_reader(img, size,
+                                                     cfg.pixel_means,
+                                                     cfg.pixel_stds)
                     batch_out.append((im, im_id, im_shape))
                     if len(batch_out) == batch_size:
                         yield batch_out
@@ -238,7 +262,9 @@ class DataSetReader(object):
                 img = {}
                 img['image'] = image
                 img['id'] = 0
-                im, im_id, im_shape = img_reader(img, size, cfg.pixel_means, cfg.pixel_stds)
+                im, im_id, im_shape = img_reader(img, size,
+                                                 cfg.pixel_means,
+                                                 cfg.pixel_stds)
                 batch_out = [(im, im_id, im_shape)]
                 yield batch_out
 
@@ -256,7 +282,8 @@ def train(size=416,
           num_workers=8,
           max_queue=32,
           use_multiprocessing=True):
-    generator = dsr.get_reader('train', size, batch_size, shuffle, int(mixup_iter/num_workers), random_sizes)
+    generator = dsr.get_reader('train', size, batch_size, shuffle,
+                               int(mixup_iter/num_workers), random_sizes)
 
     if not use_multiprocessing:
         return generator
diff --git a/PaddleCV/yolov3/train.py b/PaddleCV/yolov3/train.py
index 01394c80c7765de1b3baa78e9fc1868e1a6fae73..310b9926db2be4667ae14f8b4d7c1c584f0111c1 100644
--- a/PaddleCV/yolov3/train.py
+++ b/PaddleCV/yolov3/train.py
@@ -90,7 +90,13 @@
     total_iter = cfg.max_iter - cfg.start_iter
     mixup_iter = total_iter - cfg.no_mixup_iter
 
-    train_reader = reader.train(input_size, batch_size=cfg.batch_size, shuffle=True, total_iter=total_iter*devices_num, mixup_iter=mixup_iter*devices_num, random_sizes=random_sizes, use_multiprocessing=cfg.use_multiprocess)
+    train_reader = reader.train(input_size,
+                                batch_size=cfg.batch_size,
+                                shuffle=True,
+                                total_iter=total_iter*devices_num,
+                                mixup_iter=mixup_iter*devices_num,
+                                random_sizes=random_sizes,
+                                use_multiprocessing=cfg.use_multiprocess)
     py_reader = model.py_reader
     py_reader.decorate_paddle_reader(train_reader)
 
@@ -112,21 +118,25 @@
         for iter_id in range(cfg.start_iter, cfg.max_iter):
             prev_start_time = start_time
             start_time = time.time()
-            losses = exe.run(compile_program, fetch_list=[v.name for v in fetch_list])
+            losses = exe.run(compile_program,
+                             fetch_list=[v.name for v in fetch_list])
             smoothed_loss.add_value(np.mean(np.array(losses[0])))
             snapshot_loss += np.mean(np.array(losses[0]))
             snapshot_time += start_time - prev_start_time
             lr = np.array(fluid.global_scope().find_var('learning_rate')
                           .get_tensor())
             print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
-                iter_id, lr[0],
-                smoothed_loss.get_mean_value(), start_time - prev_start_time))
+                iter_id, lr[0],
+                smoothed_loss.get_mean_value(),
+                start_time - prev_start_time))
             sys.stdout.flush()
             if (iter_id + 1) % cfg.snapshot_iter == 0:
                 save_model("model_iter{}".format(iter_id))
-                print("Snapshot {} saved, average loss: {}, average time: {}".format(
-                    iter_id + 1, snapshot_loss / float(cfg.snapshot_iter),
-                    snapshot_time / float(cfg.snapshot_iter)))
+                print("Snapshot {} saved, average loss: {}, "
+                      "average time: {}".format(
+                          iter_id + 1,
+                          snapshot_loss / float(cfg.snapshot_iter),
+                          snapshot_time / float(cfg.snapshot_iter)))
                 snapshot_loss = 0
                 snapshot_time = 0
     except fluid.core.EOFException:
diff --git a/PaddleCV/yolov3/utility.py b/PaddleCV/yolov3/utility.py
index d28f6a862d43e9e81f7b9b260fd25dd02e2d996c..e43ac033facc8ca12f7cf46c01423be80f17a91e 100644
--- a/PaddleCV/yolov3/utility.py
+++ b/PaddleCV/yolov3/utility.py
@@ -101,27 +101,30 @@
     add_arg('dataset',          str,   'coco2017',  "Dataset: coco2014, coco2017.")
     add_arg('class_num',        int,   80,          "Class number.")
     add_arg('data_dir',         str,   'dataset/coco', "The data root path.")
-    add_arg('start_iter',      int,   0,      "Start iteration.")
-    add_arg('use_multiprocess', bool, True,   "add multiprocess.")
+    add_arg('start_iter',       int,   0,           "Start iteration.")
+    add_arg('use_multiprocess', bool,  True,        "add multiprocess.")
     #SOLVER
-    add_arg('batch_size',      int,   8,      "Mini-batch size per device.")
-    add_arg('learning_rate',   float, 0.001,  "Learning rate.")
-    add_arg('max_iter',        int,   500200, "Iter number.")
-    add_arg('snapshot_iter',   int,   2000,   "Save model every snapshot stride.")
-    add_arg('label_smooth',    bool,  True,   "Use label smooth in class label.")
-    add_arg('no_mixup_iter',   int,   40000,  "Disable mixup in last N iter.")
+    add_arg('batch_size',       int,   8,           "Mini-batch size per device.")
+    add_arg('learning_rate',    float, 0.001,       "Learning rate.")
+    add_arg('max_iter',         int,   500200,      "Iter number.")
+    add_arg('snapshot_iter',    int,   2000,        "Save model every snapshot stride.")
+    add_arg('label_smooth',     bool,  True,        "Use label smooth in class label.")
+    add_arg('no_mixup_iter',    int,   40000,       "Disable mixup in last N iter.")
     # TRAIN TEST INFER
     add_arg('input_size',       int,   608,         "Image input size of YOLOv3.")
-    add_arg('random_shape',    bool,  True,   "Resize to random shape for train reader.")
-    add_arg('valid_thresh',    float, 0.005,  "Valid confidence score for NMS.")
-    add_arg('nms_thresh',      float, 0.45,   "NMS threshold.")
+    add_arg('random_shape',     bool,  True,        "Resize to random shape for train reader.")
+    add_arg('valid_thresh',     float, 0.005,       "Valid confidence score for NMS.")
+    add_arg('nms_thresh',       float, 0.45,        "NMS threshold.")
     add_arg('nms_topk',         int,   400,         "The number of boxes to perform NMS.")
     add_arg('nms_posk',         int,   100,         "The number of boxes of NMS output.")
-    add_arg('debug',           bool,  False,  "Debug mode")
+    add_arg('debug',            bool,  False,       "Debug mode")
     # SINGLE EVAL AND DRAW
-    add_arg('image_path',      str,   'image', "The image path used to inference and visualize.")
-    add_arg('image_name',      str,   None,   "The single image used to inference and visualize. None to inference all images in image_path")
-    add_arg('draw_thresh',     float, 0.5,    "Confidence score threshold to draw prediction box in image in debug mode")
+    add_arg('image_path',       str,   'image',
+            "The image path used to inference and visualize.")
+    add_arg('image_name',       str,   None,
+            "The single image used to inference and visualize. None to inference all images in image_path")
+    add_arg('draw_thresh',      float, 0.5,
+            "Confidence score threshold to draw prediction box in image in debug mode")
     # yapf: enable
     args = parser.parse_args()
     file_name = sys.argv[0]