未验证 提交 0e45394c 编写于 作者: K Kaipeng Deng 提交者: GitHub

Merge pull request #2072 from heavengate/fix_yolo_param_pick

[cherry-pick] refine yolov3 param name
......@@ -142,6 +142,8 @@ Evaluation result is shown as below:
| 416x416 | 36.5 | 58.2 | 39.1 |
| 320x320 | 34.1 | 55.4 | 36.3 |
- **NOTE:** Evaluation is based on the `pycocotools` evaluator, and predicted bounding boxes with `score < 0.05` were not filtered out. Frameworks that filter out predicted bounding boxes with `score < 0.05` will report a lower accuracy.
## Inference and Visualization
Inference is used to get prediction scores or image features from trained models. `infer.py` is the main executor for inference; one can start the inference step by:
......
......@@ -142,6 +142,8 @@ Train Loss
| 416x416 | 36.5 | 58.2 | 39.1 |
| 320x320 | 34.1 | 55.4 | 36.3 |
- **注意:** 评估结果基于`pycocotools`评估器,没有滤除`score < 0.05`的预测框,其他框架有此滤除操作会导致精度下降。
## 模型推断及可视化
......
......@@ -51,7 +51,14 @@ def random_distort(img):
return img
def random_crop(img, boxes, labels, scores, scales=[0.3, 1.0], max_ratio=2.0, constraints=None, max_trial=50):
def random_crop(img,
boxes,
labels,
scores,
scales=[0.3, 1.0],
max_ratio=2.0,
constraints=None,
max_trial=50):
if len(boxes) == 0:
return img, boxes
......@@ -90,10 +97,12 @@ def random_crop(img, boxes, labels, scores, scales=[0.3, 1.0], max_ratio=2.0, co
while crops:
crop = crops.pop(np.random.randint(0, len(crops)))
crop_boxes, crop_labels, crop_scores, box_num = box_utils.box_crop(boxes, labels, scores, crop, (w, h))
crop_boxes, crop_labels, crop_scores, box_num = \
box_utils.box_crop(boxes, labels, scores, crop, (w, h))
if box_num < 1:
continue
img = img.crop((crop[0], crop[1], crop[0] + crop[2], crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
img = img.crop((crop[0], crop[1], crop[0] + crop[2],
crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
img = np.asarray(img)
return img, crop_boxes, crop_labels, crop_scores
img = np.asarray(img)
......@@ -118,10 +127,16 @@ def random_interp(img, size, interp=None):
h, w, _ = img.shape
im_scale_x = size / float(w)
im_scale_y = size / float(h)
img = cv2.resize(img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=interp)
img = cv2.resize(img, None, None, fx=im_scale_x, fy=im_scale_y,
interpolation=interp)
return img
def random_expand(img, gtboxes, max_ratio=4., fill=None, keep_ratio=True, thresh=0.5):
def random_expand(img,
gtboxes,
max_ratio=4.,
fill=None,
keep_ratio=True,
thresh=0.5):
if random.random() > thresh:
return img, gtboxes
......@@ -153,13 +168,21 @@ def random_expand(img, gtboxes, max_ratio=4., fill=None, keep_ratio=True, thresh
return out_img.astype('uint8'), gtboxes
def shuffle_gtbox(gtbox, gtlabel, gtscore):
gt = np.concatenate([gtbox, gtlabel[:, np.newaxis], gtscore[:, np.newaxis]], axis=1)
gt = np.concatenate([gtbox, gtlabel[:, np.newaxis],
gtscore[:, np.newaxis]], axis=1)
idx = np.arange(gt.shape[0])
np.random.shuffle(idx)
gt = gt[idx, :]
return gt[:, :4], gt[:, 4], gt[:, 5]
def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2, gtscores2):
def image_mixup(img1,
gtboxes1,
gtlabels1,
gtscores1,
img2,
gtboxes2,
gtlabels2,
gtscores2):
factor = np.random.beta(1.5, 1.5)
factor = max(0.0, min(1.0, factor))
if factor >= 1.0:
......@@ -173,7 +196,8 @@ def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2,
w = max(img1.shape[1], img2.shape[1])
img = np.zeros((h, w, img1.shape[2]), 'float32')
img[:img1.shape[0], :img1.shape[1], :] = img1.astype('float32') * factor
img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1.0 - factor)
img[:img2.shape[0], :img2.shape[1], :] += \
img2.astype('float32') * (1.0 - factor)
gtboxes = np.zeros_like(gtboxes1)
gtlabels = np.zeros_like(gtlabels1)
gtscores = np.zeros_like(gtscores1)
......@@ -208,7 +232,8 @@ def image_mixup(img1, gtboxes1, gtlabels1, gtscores1, img2, gtboxes2, gtlabels2,
def image_augment(img, gtboxes, gtlabels, gtscores, size, means=None):
img = random_distort(img)
img, gtboxes = random_expand(img, gtboxes, fill=means)
img, gtboxes, gtlabels, gtscores = random_crop(img, gtboxes, gtlabels, gtscores)
img, gtboxes, gtlabels, gtscores = \
random_crop(img, gtboxes, gtlabels, gtscores)
img = random_interp(img, size)
img, gtboxes = random_flip(img, gtboxes)
gtboxes, gtlabels, gtscores = shuffle_gtbox(gtboxes, gtlabels, gtscores)
......
......@@ -55,7 +55,13 @@ def conv_bn_layer(input,
out = fluid.layers.leaky_relu(x=out, alpha=0.1)
return out
def downsample(input, ch_out, filter_size=3, stride=2, padding=1, is_test=True, name=None):
def downsample(input,
ch_out,
filter_size=3,
stride=2,
padding=1,
is_test=True,
name=None):
return conv_bn_layer(input,
ch_out=ch_out,
filter_size=filter_size,
......@@ -65,15 +71,19 @@ def downsample(input, ch_out, filter_size=3, stride=2, padding=1, is_test=True,
name=name)
def basicblock(input, ch_out, is_test=True, name=None):
conv1 = conv_bn_layer(input, ch_out, 1, 1, 0, is_test=is_test, name=name+".0")
conv2 = conv_bn_layer(conv1, ch_out*2, 3, 1, 1, is_test=is_test, name=name+".1")
conv1 = conv_bn_layer(input, ch_out, 1, 1, 0,
is_test=is_test, name=name+".0")
conv2 = conv_bn_layer(conv1, ch_out*2, 3, 1, 1,
is_test=is_test, name=name+".1")
out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)
return out
def layer_warp(block_func, input, ch_out, count, is_test=True, name=None):
res_out = block_func(input, ch_out, is_test=is_test, name='{}.0'.format(name))
res_out = block_func(input, ch_out, is_test=is_test,
name='{}.0'.format(name))
for j in range(1, count):
res_out = block_func(res_out, ch_out, is_test=is_test, name='{}.{}'.format(name, j))
res_out = block_func(res_out, ch_out, is_test=is_test,
name='{}.{}'.format(name, j))
return res_out
DarkNet_cfg = {
......@@ -83,14 +93,21 @@ DarkNet_cfg = {
def add_DarkNet53_conv_body(body_input, is_test=True):
stages, block_func = DarkNet_cfg[53]
stages = stages[0:5]
conv1 = conv_bn_layer(
body_input, ch_out=32, filter_size=3, stride=1, padding=1, is_test=is_test, name="yolo_input")
downsample_ = downsample(conv1, ch_out=conv1.shape[1]*2, is_test=is_test, name="yolo_input.downsample")
conv1 = conv_bn_layer(body_input, ch_out=32, filter_size=3,
stride=1, padding=1, is_test=is_test,
name="yolo_input")
downsample_ = downsample(conv1, ch_out=conv1.shape[1]*2,
is_test=is_test,
name="yolo_input.downsample")
blocks = []
for i, stage in enumerate(stages):
block = layer_warp(block_func, downsample_, 32 *(2**i), stage, is_test=is_test, name="stage.{}".format(i))
block = layer_warp(block_func, downsample_, 32 *(2**i),
stage, is_test=is_test,
name="stage.{}".format(i))
blocks.append(block)
if i < len(stages) - 1: # do not downsaple in the last stage
downsample_ = downsample(block, ch_out=block.shape[1]*2, is_test=is_test, name="stage.{}.downsample".format(i))
downsample_ = downsample(block, ch_out=block.shape[1]*2,
is_test=is_test,
name="stage.{}.downsample".format(i))
return blocks[-1:-4:-1]
......@@ -27,13 +27,22 @@ from .darknet import add_DarkNet53_conv_body
from .darknet import conv_bn_layer
def yolo_detection_block(input, channel, is_test=True, name=None):
assert channel % 2 == 0, "channel {} cannot be divided by 2".format(channel)
assert channel % 2 == 0, \
"channel {} cannot be divided by 2".format(channel)
conv = input
for j in range(2):
conv = conv_bn_layer(conv, channel, filter_size=1, stride=1, padding=0, is_test=is_test, name='{}.{}.0'.format(name, j))
conv = conv_bn_layer(conv, channel*2, filter_size=3, stride=1, padding=1, is_test=is_test, name='{}.{}.1'.format(name, j))
route = conv_bn_layer(conv, channel, filter_size=1, stride=1, padding=0, is_test=is_test, name='{}.2'.format(name))
tip = conv_bn_layer(route,channel*2, filter_size=3, stride=1, padding=1, is_test=is_test, name='{}.tip'.format(name))
conv = conv_bn_layer(conv, channel, filter_size=1,
stride=1, padding=0, is_test=is_test,
name='{}.{}.0'.format(name, j))
conv = conv_bn_layer(conv, channel*2, filter_size=3,
stride=1, padding=1, is_test=is_test,
name='{}.{}.1'.format(name, j))
route = conv_bn_layer(conv, channel, filter_size=1, stride=1,
padding=0, is_test=is_test,
name='{}.2'.format(name))
tip = conv_bn_layer(route,channel*2, filter_size=3, stride=1,
padding=1, is_test=is_test,
name='{}.tip'.format(name))
return route, tip
def upsample(input, scale=2,name=None):
......@@ -68,11 +77,15 @@ class YOLOv3(object):
if self.is_train:
self.py_reader = fluid.layers.py_reader(
capacity=64,
shapes = [[-1] + self.image_shape, [-1, cfg.max_box_num, 4], [-1, cfg.max_box_num], [-1, cfg.max_box_num]],
shapes = [[-1] + self.image_shape,
[-1, cfg.max_box_num, 4],
[-1, cfg.max_box_num],
[-1, cfg.max_box_num]],
lod_levels=[0, 0, 0, 0],
dtypes=['float32'] * 2 + ['int32'] + ['float32'],
use_double_buffer=True)
self.image, self.gtbox, self.gtlabel, self.gtscore = fluid.layers.read_file(self.py_reader)
self.image, self.gtbox, self.gtlabel, self.gtscore = \
fluid.layers.read_file(self.py_reader)
else:
self.image = fluid.layers.data(
name='image', shape=self.image_shape, dtype='float32'
......@@ -139,9 +152,9 @@ class YOLOv3(object):
if self.is_train:
loss = fluid.layers.yolov3_loss(
x=out,
gtbox=self.gtbox,
gtlabel=self.gtlabel,
gtscore=self.gtscore,
gt_box=self.gtbox,
gt_label=self.gtlabel,
gt_score=self.gtscore,
anchors=cfg.anchors,
anchor_mask=anchor_mask,
class_num=cfg.class_num,
......
......@@ -53,13 +53,17 @@ class DataSetReader(object):
cfg.dataset))
if mode == 'train':
cfg.train_file_list = os.path.join(cfg.data_dir, cfg.train_file_list)
cfg.train_data_dir = os.path.join(cfg.data_dir, cfg.train_data_dir)
cfg.train_file_list = os.path.join(cfg.data_dir,
cfg.train_file_list)
cfg.train_data_dir = os.path.join(cfg.data_dir,
cfg.train_data_dir)
self.COCO = COCO(cfg.train_file_list)
self.img_dir = cfg.train_data_dir
elif mode == 'test' or mode == 'infer':
cfg.val_file_list = os.path.join(cfg.data_dir, cfg.val_file_list)
cfg.val_data_dir = os.path.join(cfg.data_dir, cfg.val_data_dir)
cfg.val_file_list = os.path.join(cfg.data_dir,
cfg.val_file_list)
cfg.val_data_dir = os.path.join(cfg.data_dir,
cfg.val_data_dir)
self.COCO = COCO(cfg.val_file_list)
self.img_dir = cfg.val_data_dir
......@@ -88,7 +92,8 @@ class DataSetReader(object):
def _parse_gt_annotations(self, img):
img_height = img['height']
img_width = img['width']
anno = self.COCO.loadAnns(self.COCO.getAnnIds(imgIds=img['id'], iscrowd=None))
anno = self.COCO.loadAnns(
self.COCO.getAnnIds(imgIds=img['id'], iscrowd=None))
gt_index = 0
for target in anno:
if target['area'] < cfg.gt_min_area:
......@@ -96,13 +101,15 @@ class DataSetReader(object):
if 'ignore' in target and target['ignore']:
continue
box = box_utils.coco_anno_box_to_center_relative(target['bbox'], img_height, img_width)
box = box_utils.coco_anno_box_to_center_relative(
target['bbox'], img_height, img_width)
if box[2] <= 0 and box[3] <= 0:
continue
img['gt_id'][gt_index] = np.int32(target['id'])
img['gt_boxes'][gt_index] = box
img['gt_labels'][gt_index] = self.category_to_id_map[target['category_id']]
img['gt_labels'][gt_index] = \
self.category_to_id_map[target['category_id']]
gt_index += 1
if gt_index >= cfg.max_box_num:
break
......@@ -136,10 +143,18 @@ class DataSetReader(object):
else:
return self._parse_images(is_train=(mode=='train'))
def get_reader(self, mode, size=416, batch_size=None, shuffle=False, mixup_iter=0, random_sizes=[], image=None):
def get_reader(self,
mode,
size=416,
batch_size=None,
shuffle=False,
mixup_iter=0,
random_sizes=[],
image=None):
assert mode in ['train', 'test', 'infer'], "Unknow mode type!"
if mode != 'infer':
assert batch_size is not None, "batch size connot be None in mode {}".format(mode)
assert batch_size is not None, \
"batch size connot be None in mode {}".format(mode)
self._parse_dataset_dir(mode)
self._parse_dataset_catagory()
......@@ -151,7 +166,9 @@ class DataSetReader(object):
h, w, _ = im.shape
im_scale_x = size / float(w)
im_scale_y = size / float(h)
out_img = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=cv2.INTER_CUBIC)
out_img = cv2.resize(im, None, None,
fx=im_scale_x, fy=im_scale_y,
interpolation=cv2.INTER_CUBIC)
mean = np.array(mean).reshape((1, 1, -1))
std = np.array(std).reshape((1, 1, -1))
out_img = (out_img / 255.0 - mean) / std
......@@ -173,11 +190,14 @@ class DataSetReader(object):
mixup_gt_boxes = np.array(mixup_img['gt_boxes']).copy()
mixup_gt_labels = np.array(mixup_img['gt_labels']).copy()
mixup_gt_scores = np.ones_like(mixup_gt_labels)
im, gt_boxes, gt_labels, gt_scores = image_utils.image_mixup(im, gt_boxes, \
gt_labels, gt_scores, mixup_im, mixup_gt_boxes, mixup_gt_labels, \
mixup_gt_scores)
im, gt_boxes, gt_labels, gt_scores = \
image_utils.image_mixup(im, gt_boxes, gt_labels,
gt_scores, mixup_im, mixup_gt_boxes,
mixup_gt_labels, mixup_gt_scores)
im, gt_boxes, gt_labels, gt_scores = image_utils.image_augment(im, gt_boxes, gt_labels, gt_scores, size, mean)
im, gt_boxes, gt_labels, gt_scores = \
image_utils.image_augment(im, gt_boxes, gt_labels,
gt_scores, size, mean)
mean = np.array(mean).reshape((1, 1, -1))
std = np.array(std).reshape((1, 1, -1))
......@@ -214,7 +234,9 @@ class DataSetReader(object):
read_cnt += 1
if read_cnt % len(imgs) == 0 and shuffle:
np.random.shuffle(imgs)
im, gt_boxes, gt_labels, gt_scores = img_reader_with_augment(img, img_size, cfg.pixel_means, cfg.pixel_stds, mixup_img)
im, gt_boxes, gt_labels, gt_scores = \
img_reader_with_augment(img, img_size, cfg.pixel_means,
cfg.pixel_stds, mixup_img)
batch_out.append([im, gt_boxes, gt_labels, gt_scores])
if len(batch_out) == batch_size:
......@@ -227,7 +249,9 @@ class DataSetReader(object):
imgs = self._parse_images_by_mode(mode)
batch_out = []
for img in imgs:
im, im_id, im_shape = img_reader(img, size, cfg.pixel_means, cfg.pixel_stds)
im, im_id, im_shape = img_reader(img, size,
cfg.pixel_means,
cfg.pixel_stds)
batch_out.append((im, im_id, im_shape))
if len(batch_out) == batch_size:
yield batch_out
......@@ -238,7 +262,9 @@ class DataSetReader(object):
img = {}
img['image'] = image
img['id'] = 0
im, im_id, im_shape = img_reader(img, size, cfg.pixel_means, cfg.pixel_stds)
im, im_id, im_shape = img_reader(img, size,
cfg.pixel_means,
cfg.pixel_stds)
batch_out = [(im, im_id, im_shape)]
yield batch_out
......@@ -256,7 +282,8 @@ def train(size=416,
num_workers=8,
max_queue=32,
use_multiprocessing=True):
generator = dsr.get_reader('train', size, batch_size, shuffle, int(mixup_iter/num_workers), random_sizes)
generator = dsr.get_reader('train', size, batch_size, shuffle,
int(mixup_iter/num_workers), random_sizes)
if not use_multiprocessing:
return generator
......
......@@ -90,7 +90,13 @@ def train():
total_iter = cfg.max_iter - cfg.start_iter
mixup_iter = total_iter - cfg.no_mixup_iter
train_reader = reader.train(input_size, batch_size=cfg.batch_size, shuffle=True, total_iter=total_iter*devices_num, mixup_iter=mixup_iter*devices_num, random_sizes=random_sizes, use_multiprocessing=cfg.use_multiprocess)
train_reader = reader.train(input_size,
batch_size=cfg.batch_size,
shuffle=True,
total_iter=total_iter*devices_num,
mixup_iter=mixup_iter*devices_num,
random_sizes=random_sizes,
use_multiprocessing=cfg.use_multiprocess)
py_reader = model.py_reader
py_reader.decorate_paddle_reader(train_reader)
......@@ -112,7 +118,8 @@ def train():
for iter_id in range(cfg.start_iter, cfg.max_iter):
prev_start_time = start_time
start_time = time.time()
losses = exe.run(compile_program, fetch_list=[v.name for v in fetch_list])
losses = exe.run(compile_program,
fetch_list=[v.name for v in fetch_list])
smoothed_loss.add_value(np.mean(np.array(losses[0])))
snapshot_loss += np.mean(np.array(losses[0]))
snapshot_time += start_time - prev_start_time
......@@ -120,12 +127,15 @@ def train():
.get_tensor())
print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
iter_id, lr[0],
smoothed_loss.get_mean_value(), start_time - prev_start_time))
smoothed_loss.get_mean_value(),
start_time - prev_start_time))
sys.stdout.flush()
if (iter_id + 1) % cfg.snapshot_iter == 0:
save_model("model_iter{}".format(iter_id))
print("Snapshot {} saved, average loss: {}, average time: {}".format(
iter_id + 1, snapshot_loss / float(cfg.snapshot_iter),
print("Snapshot {} saved, average loss: {}, \
average time: {}".format(
iter_id + 1,
snapshot_loss / float(cfg.snapshot_iter),
snapshot_time / float(cfg.snapshot_iter)))
snapshot_loss = 0
snapshot_time = 0
......
......@@ -119,9 +119,12 @@ def parse_args():
add_arg('nms_posk', int, 100, "The number of boxes of NMS output.")
add_arg('debug', bool, False, "Debug mode")
# SINGLE EVAL AND DRAW
add_arg('image_path', str, 'image', "The image path used to inference and visualize.")
add_arg('image_name', str, None, "The single image used to inference and visualize. None to inference all images in image_path")
add_arg('draw_thresh', float, 0.5, "Confidence score threshold to draw prediction box in image in debug mode")
add_arg('image_path', str, 'image',
"The image path used to inference and visualize.")
add_arg('image_name', str, None,
"The single image used to inference and visualize. None to inference all images in image_path")
add_arg('draw_thresh', float, 0.5,
"Confidence score threshold to draw prediction box in image in debug mode")
# yapf: enable
args = parser.parse_args()
file_name = sys.argv[0]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册