diff --git a/FPS_test.py b/FPS_test.py
index 7f024e6cf6b12516ff39a20401b5076e86488657..105b43a95ba31c1fc7a46a6c45bd70817797363c 100644
--- a/FPS_test.py
+++ b/FPS_test.py
@@ -25,20 +25,20 @@ The FPS measured in video.py will be lower than this FPS, because the camera read rate is limited
 '''
 class FPS_YOLO(YOLO):
     def get_FPS(self, image, test_interval):
-        # Resize the image to match the input requirements
         image_shape = np.array(np.shape(image)[0:2])
-
+        #---------------------------------------------------------#
+        #   Add gray bars to the image for a distortion-free resize
+        #---------------------------------------------------------#
        crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
-        photo = np.array(crop_img,dtype = np.float32)
-        photo /= 255.0
+        photo = np.array(crop_img,dtype = np.float32) / 255.0
         photo = np.transpose(photo, (2, 0, 1))
-        photo = photo.astype(np.float32)
-        images = []
-        images.append(photo)
-        images = np.asarray(images)
+        #---------------------------------------------------------#
+        #   Add the batch_size dimension
+        #---------------------------------------------------------#
+        images = [photo]
 
         with torch.no_grad():
-            images = torch.from_numpy(images)
+            images = torch.from_numpy(np.asarray(images))
             if self.cuda:
                 images = images.cuda()
             outputs = self.net(images)
diff --git a/VOCdevkit/VOC2007/voc2yolo3.py b/VOCdevkit/VOC2007/voc2yolo3.py
index 672eb48f75b95a74ed53433e3022597f94f0c6a4..c1eadf8a0d8c13e8f5f7a4712223e876b491787b 100644
--- a/VOCdevkit/VOC2007/voc2yolo3.py
+++ b/VOCdevkit/VOC2007/voc2yolo3.py
@@ -1,10 +1,18 @@
+
+#----------------------------------------------------------------------#
+#   The validation split is performed inside train.py.
+#   It is normal for test.txt and val.txt to be empty; training
+#   does not use them.
+#----------------------------------------------------------------------#
 import os
 import random
-random.seed(0)
-
+ 
 xmlfilepath=r'./VOCdevkit/VOC2007/Annotations'
 saveBasePath=r"./VOCdevkit/VOC2007/ImageSets/Main/"
+#----------------------------------------------------------------------#
+#   To add a test set, modify trainval_percent.
+#   train_percent does not need to be changed.
+#----------------------------------------------------------------------#
 trainval_percent=1
 train_percent=1
diff --git a/get_dr_txt.py b/get_dr_txt.py
index f92bcd0faea88b565f0d76f8cec54ee5a8b02e44..177d53d9d642bf75f1d00a812d1421f2a10c22d8 100644
--- a/get_dr_txt.py
+++ b/get_dr_txt.py
@@ -1,22 +1,26 @@
-#-------------------------------------#
-#       Code for generating the files needed for mAP
-#       See Bilibili for the full tutorial
-#       Bubbliiiing
-#-------------------------------------#
-import cv2
-import numpy as np
+#----------------------------------------------------#
+#   Generate detection-results and images-optional for the test set
+#   Video tutorial:
+#   https://www.bilibili.com/video/BV1zE411u7Vw
+#----------------------------------------------------#
 import colorsys
 import os
+
+import cv2
+import numpy as np
 import torch
-import torch.nn as nn
 import torch.backends.cudnn as cudnn
+import torch.nn as nn
+from PIL import Image, ImageDraw, ImageFont
 from torch.autograd import Variable
-from yolo import YOLO
+from tqdm import tqdm
+
 from nets.yolo3 import YoloBody
-from PIL import Image,ImageFont, ImageDraw
 from utils.config import Config
-from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes
-from tqdm import tqdm
+from utils.utils import (DecodeBox, bbox_iou, letterbox_image,
+                         non_max_suppression, yolo_correct_boxes)
+from yolo import YOLO
+
 
 class mAP_Yolo(YOLO):
     #---------------------------------------------------#
@@ -28,40 +32,61 @@ class mAP_Yolo(YOLO):
         f = open("./input/detection-results/"+image_id+".txt","w")
         image_shape = np.array(np.shape(image)[0:2])
+
+        #---------------------------------------------------------#
+        #   Add gray bars to the image for a distortion-free resize
+        #---------------------------------------------------------#
         crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
-        photo = np.array(crop_img,dtype = np.float32)
-        photo /= 255.0
+        photo = np.array(crop_img,dtype = np.float32) / 255.0
         photo = np.transpose(photo, (2, 0, 1))
-        photo = photo.astype(np.float32)
-        images = []
-        images.append(photo)
-
-        images = np.asarray(images)
-        images = torch.from_numpy(images)
-        if self.cuda:
-            images = images.cuda()
-
+        #---------------------------------------------------------#
+        #   Add the batch_size dimension
+        #---------------------------------------------------------#
+        images = [photo]
+
         with torch.no_grad():
+            images = torch.from_numpy(np.asarray(images))
+            if self.cuda:
+                images = images.cuda()
+
+            #---------------------------------------------------------#
+            #   Feed the image into the network for prediction!
+            #---------------------------------------------------------#
             outputs = self.net(images)
             output_list = []
             for i in range(3):
                 output_list.append(self.yolo_decodes[i](outputs[i]))
+
+            #---------------------------------------------------------#
+            #   Stack the prediction boxes, then run non-maximum suppression
+            #---------------------------------------------------------#
             output = torch.cat(output_list, 1)
             batch_detections = non_max_suppression(output, self.config["yolo"]["classes"],
                                                    conf_thres=self.confidence,
                                                    nms_thres=self.iou)
-        try :
-            batch_detections = batch_detections[0].cpu().numpy()
-        except:
-            return image
-        top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
-        top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
-        top_label = np.array(batch_detections[top_index,-1],np.int32)
-        top_bboxes = np.array(batch_detections[top_index,:4])
-        top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
-
-        # Remove the gray bars
-        boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
+
+            #---------------------------------------------------------#
+            #   If no object is detected, return the original image
+            #---------------------------------------------------------#
+            try :
+                batch_detections = batch_detections[0].cpu().numpy()
+            except:
+                return image
+
+            #---------------------------------------------------------#
+            #   Filter the prediction boxes by score
+            #---------------------------------------------------------#
+            top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence
+            top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
+            top_label = np.array(batch_detections[top_index,-1],np.int32)
+            top_bboxes = np.array(batch_detections[top_index,:4])
+            top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
+
+            #-----------------------------------------------------------------#
+            #   letterbox_image added gray bars around the image before it
+            #   was fed into the network, so the resulting top_bboxes are
+            #   relative to the padded image. Correct them to remove the
+            #   gray-bar offset.
+            #-----------------------------------------------------------------#
+            boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
 
         for i, c in enumerate(top_label):
             predicted_class = self.class_names[c]
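The "gray bars" comments above refer to utils.letterbox_image: shrink the image without changing its aspect ratio, then pad the remainder with gray. A minimal sketch of the idea, assuming a PIL input and a (width, height) target such as (416, 416); the repo's own implementation may differ in detail:

    import numpy as np
    from PIL import Image

    def letterbox_sketch(image, size):
        iw, ih = image.size
        w, h = size
        scale = min(w / iw, h / ih)              # shrink so the whole image fits
        nw, nh = int(iw * scale), int(ih * scale)
        resized = image.resize((nw, nh), Image.BICUBIC)
        # paste the resized image centered on a gray canvas
        canvas = Image.new('RGB', size, (128, 128, 128))
        canvas.paste(resized, ((w - nw) // 2, (h - nh) // 2))
        return canvas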
diff --git a/nets/darknet.py b/nets/darknet.py
index 7d02903c3f33eab3fa2a0493202071ddc6f1287b..71cb9b7575905b1707baaa8a2245d77b10af938a 100644
--- a/nets/darknet.py
+++ b/nets/darknet.py
@@ -1,9 +1,15 @@
-import torch
-import torch.nn as nn
 import math
 from collections import OrderedDict
 
-# Basic darknet block
+import torch
+import torch.nn as nn
+
+
+#---------------------------------------------------------------------#
+#   Residual block
+#   A 1x1 convolution reduces the channel count, then a 3x3 convolution
+#   extracts features and raises the channel count back.
+#   A residual connection is added at the end.
+#---------------------------------------------------------------------#
 class BasicBlock(nn.Module):
     def __init__(self, inplanes, planes):
         super(BasicBlock, self).__init__()
@@ -36,14 +42,20 @@ class DarkNet(nn.Module):
     def __init__(self, layers):
         super(DarkNet, self).__init__()
         self.inplanes = 32
+        # 416,416,3 -> 416,416,32
         self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
         self.bn1 = nn.BatchNorm2d(self.inplanes)
         self.relu1 = nn.LeakyReLU(0.1)
+
+        # 416,416,32 -> 208,208,64
         self.layer1 = self._make_layer([32, 64], layers[0])
+        # 208,208,64 -> 104,104,128
         self.layer2 = self._make_layer([64, 128], layers[1])
+        # 104,104,128 -> 52,52,256
         self.layer3 = self._make_layer([128, 256], layers[2])
+        # 52,52,256 -> 26,26,512
         self.layer4 = self._make_layer([256, 512], layers[3])
+        # 26,26,512 -> 13,13,1024
         self.layer5 = self._make_layer([512, 1024], layers[4])
 
         self.layers_out_filters = [64, 128, 256, 512, 1024]
@@ -57,6 +69,10 @@ class DarkNet(nn.Module):
                 m.weight.data.fill_(1)
                 m.bias.data.zero_()
 
+    #---------------------------------------------------------------------#
+    #   Inside each layer, a 3x3 convolution with stride 2 first
+    #   downsamples, then the residual blocks are stacked.
+    #---------------------------------------------------------------------#
     def _make_layer(self, planes, blocks):
         layers = []
         # Downsampling: stride 2, kernel size 3
         layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3,
                        stride=2, padding=1, bias=False)))
         layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
         layers.append(("ds_relu", nn.LeakyReLU(0.1)))
-        # Add the darknet blocks
+        # Stack the residual blocks
         self.inplanes = planes[1]
         for i in range(0, blocks):
             layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
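The shape comments in DarkNet.__init__ can be checked directly. A small sanity check, assuming the repo root is on PYTHONPATH so nets.darknet imports exactly as in the diff:

    import torch
    from nets.darknet import darknet53

    model = darknet53(None)
    x = torch.randn(1, 3, 416, 416)
    out3, out4, out5 = model(x)     # the three effective feature layers
    print(out3.shape)               # torch.Size([1, 256, 52, 52])
    print(out4.shape)               # torch.Size([1, 512, 26, 26])
    print(out5.shape)               # torch.Size([1, 1024, 13, 13])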
diff --git a/nets/yolo3.py b/nets/yolo3.py
index a7aa43bc21c07fb2a7e278f4de12a2362760380a..6916db5e97dcf73dc4a2cc7063df1c1b59bd5c2d 100644
--- a/nets/yolo3.py
+++ b/nets/yolo3.py
@@ -1,8 +1,11 @@
+from collections import OrderedDict
+
 import torch
 import torch.nn as nn
-from collections import OrderedDict
+
 from nets.darknet import darknet53
 
+
 def conv2d(filter_in, filter_out, kernel_size):
     pad = (kernel_size - 1) // 2 if kernel_size else 0
     return nn.Sequential(OrderedDict([
@@ -11,6 +14,10 @@ def conv2d(filter_in, filter_out, kernel_size):
         ("relu", nn.LeakyReLU(0.1)),
     ]))
 
+#------------------------------------------------------------------------#
+#   make_last_layers contains seven convolutions in total: the first
+#   five extract features, the last two produce the yolo network's
+#   prediction results.
+#------------------------------------------------------------------------#
 def make_last_layers(filters_list, in_filters, out_filter):
     m = nn.ModuleList([
         conv2d(in_filters, filters_list[0], 1),
@@ -28,21 +35,30 @@ class YoloBody(nn.Module):
     def __init__(self, config):
         super(YoloBody, self).__init__()
         self.config = config
-        #  backbone
+        #---------------------------------------------------#
+        #   Build the darknet53 backbone model.
+        #   It returns three effective feature layers with shapes:
+        #   52,52,256
+        #   26,26,512
+        #   13,13,1024
+        #---------------------------------------------------#
         self.backbone = darknet53(None)
+
+        # out_filters : [64, 128, 256, 512, 1024]
         out_filters = self.backbone.layers_out_filters
-        #  last_layer0
+
+        #------------------------------------------------------------------------#
+        #   Compute the output channel count of the yolo head.
+        #   For the VOC dataset:
+        #   final_out_filter0 = final_out_filter1 = final_out_filter2 = 75
+        #------------------------------------------------------------------------#
         final_out_filter0 = len(config["yolo"]["anchors"][0]) * (5 + config["yolo"]["classes"])
         self.last_layer0 = make_last_layers([512, 1024], out_filters[-1], final_out_filter0)
-        #  embedding1
+
         final_out_filter1 = len(config["yolo"]["anchors"][1]) * (5 + config["yolo"]["classes"])
         self.last_layer1_conv = conv2d(512, 256, 1)
         self.last_layer1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
         self.last_layer1 = make_last_layers([256, 512], out_filters[-2] + 256, final_out_filter1)
-        #  embedding2
+
         final_out_filter2 = len(config["yolo"]["anchors"][2]) * (5 + config["yolo"]["classes"])
         self.last_layer2_conv = conv2d(256, 128, 1)
         self.last_layer2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
@@ -56,21 +72,43 @@ class YoloBody(nn.Module):
             if i == 4:
                 out_branch = layer_in
             return layer_in, out_branch
-        #  backbone
+        #---------------------------------------------------#
+        #   Get the three effective feature layers; their shapes are:
+        #   52,52,256; 26,26,512; 13,13,1024
+        #---------------------------------------------------#
         x2, x1, x0 = self.backbone(x)
-        #  yolo branch 0
+
+        #---------------------------------------------------#
+        #   First feature layer
+        #   out0 = (batch_size,255,13,13)
+        #---------------------------------------------------#
+        # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
         out0, out0_branch = _branch(self.last_layer0, x0)
-        #  yolo branch 1
+
+        # 13,13,512 -> 13,13,256 -> 26,26,256
         x1_in = self.last_layer1_conv(out0_branch)
         x1_in = self.last_layer1_upsample(x1_in)
+
+        # 26,26,256 + 26,26,512 -> 26,26,768
         x1_in = torch.cat([x1_in, x1], 1)
+        #---------------------------------------------------#
+        #   Second feature layer
+        #   out1 = (batch_size,255,26,26)
+        #---------------------------------------------------#
+        # 26,26,768 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
         out1, out1_branch = _branch(self.last_layer1, x1_in)
-        #  yolo branch 2
+
+        # 26,26,256 -> 26,26,128 -> 52,52,128
         x2_in = self.last_layer2_conv(out1_branch)
         x2_in = self.last_layer2_upsample(x2_in)
+
+        # 52,52,128 + 52,52,256 -> 52,52,384
         x2_in = torch.cat([x2_in, x2], 1)
+        #---------------------------------------------------#
+        #   Third feature layer
+        #   out2 = (batch_size,255,52,52)
+        #---------------------------------------------------#
+        # 52,52,384 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
         out2, _ = _branch(self.last_layer2, x2_in)
         return out0, out1, out2
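The final_out_filter arithmetic in the comment is worth spelling out: per anchor the head predicts 4 box parameters, 1 objectness score and num_classes class scores, and each head owns 3 anchors:

    num_anchors_per_head = 3
    voc_classes = 20
    coco_classes = 80

    print(num_anchors_per_head * (5 + voc_classes))   # 75  -> VOC, as in the comment
    print(num_anchors_per_head * (5 + coco_classes))  # 255 -> COCO, the 255 in the shape notes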
diff --git a/nets/yolo_training.py b/nets/yolo_training.py
index f9e97a5f1b35141dce62dd20f8d05b5da3d1433e..6bc9610d6854726b03be9c655f83b8bf53c45061 100644
--- a/nets/yolo_training.py
+++ b/nets/yolo_training.py
@@ -1,17 +1,21 @@
-import cv2
+import math
 from random import shuffle
+
+import cv2
 import numpy as np
 import torch
 import torch.nn as nn
-import math
 import torch.nn.functional as F
-from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
+from matplotlib.colors import hsv_to_rgb, rgb_to_hsv
 from PIL import Image
 
 from utils.utils import bbox_iou
 
+
 def jaccard(_box_a, _box_b):
+    # Compute the top-left and bottom-right corners of the ground-truth boxes
     b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2
     b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2
+    # Compute the top-left and bottom-right corners of the anchor boxes
     b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2
     b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2
     box_a = torch.zeros_like(_box_a)
@@ -53,12 +57,21 @@ def BCELoss(pred,target):
     return output
 
 class YOLOLoss(nn.Module):
-    def __init__(self, anchors, num_classes, img_size, cuda):
+    def __init__(self, anchors, num_classes, img_size, cuda, normalize):
         super(YOLOLoss, self).__init__()
+        #-----------------------------------------------------------#
+        #   The 13x13 feature layer uses anchors [116,90],[156,198],[373,326]
+        #   The 26x26 feature layer uses anchors [30,61],[62,45],[59,119]
+        #   The 52x52 feature layer uses anchors [10,13],[16,30],[33,23]
+        #-----------------------------------------------------------#
         self.anchors = anchors
         self.num_anchors = len(anchors)
         self.num_classes = num_classes
         self.bbox_attrs = 5 + num_classes
+        #-------------------------------------#
+        #   Widths/heights of the feature layers:
+        #   13, 26, 52
+        #-------------------------------------#
         self.feature_length = [img_size[0]//32,img_size[0]//16,img_size[0]//8]
         self.img_size = img_size
@@ -68,60 +81,103 @@ class YOLOLoss(nn.Module):
         self.lambda_conf = 1.0
         self.lambda_cls = 1.0
         self.cuda = cuda
+        self.normalize = normalize
 
     def forward(self, input, targets=None):
-        # input is bs,3*(5+num_classes),13,13
+        #----------------------------------------------------#
+        #   input has shape  bs, 3*(5+num_classes), 13, 13
+        #                    bs, 3*(5+num_classes), 26, 26
+        #                    bs, 3*(5+num_classes), 52, 52
+        #----------------------------------------------------#
 
-        # Number of images in the batch
+        #-----------------------#
+        #   Number of images in the batch
+        #-----------------------#
         bs = input.size(0)
-        # Height of the feature layer
+        #-----------------------#
+        #   Height of the feature layer
+        #-----------------------#
         in_h = input.size(2)
-        # Width of the feature layer
+        #-----------------------#
+        #   Width of the feature layer
+        #-----------------------#
         in_w = input.size(3)
 
-        # Compute the stride:
-        # how many pixels on the original image each feature point covers.
-        # For a 13x13 feature layer, one feature point covers 32 pixels of the original image.
+        #-----------------------------------------------------------------------#
+        #   Compute the stride:
+        #   how many pixels on the original image each feature point covers.
+        #   For a 13x13 feature layer, one feature point covers 32 pixels;
+        #   for a 26x26 feature layer, one feature point covers 16 pixels;
+        #   for a 52x52 feature layer, one feature point covers 8 pixels.
+        #   stride_h = stride_w = 32, 16, 8
+        #-----------------------------------------------------------------------#
         stride_h = self.img_size[1] / in_h
         stride_w = self.img_size[0] / in_w
 
-        # Convert the anchor sizes to feature-layer scale,
-        # i.e. compute the anchor widths/heights on the feature layer
+        #-------------------------------------------------#
+        #   The scaled_anchors obtained here are sized
+        #   relative to the feature layer
+        #-------------------------------------------------#
         scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors]
 
-        # bs,3*(5+num_classes),13,13 -> bs,3,13,13,(5+num_classes)
+        #-----------------------------------------------#
+        #   There are three inputs; after reshaping their shapes are
+        #   batch_size, 3, 13, 13, 5 + num_classes
+        #   batch_size, 3, 26, 26, 5 + num_classes
+        #   batch_size, 3, 52, 52, 5 + num_classes
+        #-----------------------------------------------#
         prediction = input.view(bs, int(self.num_anchors/3),
                                 self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous()
 
-        # Adjust the prediction
-        x = torch.sigmoid(prediction[..., 0])  # Center x
-        y = torch.sigmoid(prediction[..., 1])  # Center y
-        w = prediction[..., 2]  # Width
-        h = prediction[..., 3]  # Height
-        conf = torch.sigmoid(prediction[..., 4])  # Conf
-        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
-
-        # Find which anchors contain objects
+        # Adjustment parameters for the anchor center positions
+        x = torch.sigmoid(prediction[..., 0])
+        y = torch.sigmoid(prediction[..., 1])
+        # Adjustment parameters for the anchor width and height
+        w = prediction[..., 2]
+        h = prediction[..., 3]
+        # Objectness confidence: is there an object
+        conf = torch.sigmoid(prediction[..., 4])
+        # Class confidence
+        pred_cls = torch.sigmoid(prediction[..., 5:])
+
+        #---------------------------------------------------------------#
+        #   Find which anchors contain objects,
+        #   using the IoU between ground-truth boxes and anchors.
+        #   mask        batch_size, 3, in_h, in_w   feature points with objects
+        #   noobj_mask  batch_size, 3, in_h, in_w   feature points without objects
+        #   tx          batch_size, 3, in_h, in_w   true x offset of the center
+        #   ty          batch_size, 3, in_h, in_w   true y offset of the center
+        #   tw          batch_size, 3, in_h, in_w   true width adjustment
+        #   th          batch_size, 3, in_h, in_w   true height adjustment
+        #   tconf       batch_size, 3, in_h, in_w   true confidence
+        #   tcls        batch_size, 3, in_h, in_w, num_classes   true class
+        #----------------------------------------------------------------#
         mask, noobj_mask, tx, ty, tw, th, tconf, tcls, box_loss_scale_x, box_loss_scale_y =\
                                                                             self.get_target(targets, scaled_anchors,
                                                                                             in_w, in_h,
                                                                                             self.ignore_threshold)
+        #---------------------------------------------------------------#
+        #   Decode the predictions and measure their overlap with the
+        #   ground truth. If the overlap is too large, ignore the point:
+        #   such feature points already predict fairly accurately and
+        #   are unsuitable as negative samples.
+        #----------------------------------------------------------------#
         noobj_mask = self.get_ignore(prediction, targets, scaled_anchors, in_w, in_h, noobj_mask)
+
         if self.cuda:
             box_loss_scale_x = (box_loss_scale_x).cuda()
             box_loss_scale_y = (box_loss_scale_y).cuda()
             mask, noobj_mask = mask.cuda(), noobj_mask.cuda()
             tx, ty, tw, th = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda()
             tconf, tcls = tconf.cuda(), tcls.cuda()
-        box_loss_scale = 2 - box_loss_scale_x*box_loss_scale_y
+        box_loss_scale = 2 - box_loss_scale_x * box_loss_scale_y
 
-        #  losses.
+        # Loss for the center offsets; BCELoss works a bit better here
         loss_x = torch.sum(BCELoss(x, tx) / bs * box_loss_scale * mask)
         loss_y = torch.sum(BCELoss(y, ty) / bs * box_loss_scale * mask)
+        # Loss for the width/height adjustments
         loss_w = torch.sum(MSELoss(w, tw) / bs * 0.5 * box_loss_scale * mask)
         loss_h = torch.sum(MSELoss(h, th) / bs * 0.5 * box_loss_scale * mask)
-
+        # Confidence loss
         loss_conf = torch.sum(BCELoss(conf, mask) * mask / bs) + \
                     torch.sum(BCELoss(conf, mask) * noobj_mask / bs)
@@ -130,19 +186,30 @@ class YOLOLoss(nn.Module):
         loss = loss_x * self.lambda_xy + loss_y * self.lambda_xy + \
                loss_w * self.lambda_wh + loss_h * self.lambda_wh + \
               loss_conf * self.lambda_conf + loss_cls * self.lambda_cls
+
         # print(loss, loss_x.item() + loss_y.item(), loss_w.item() + loss_h.item(),
         #         loss_conf.item(), loss_cls.item(), \
         #         torch.sum(mask),torch.sum(noobj_mask))
-        return loss, loss_x.item(), loss_y.item(), loss_w.item(), \
-                loss_h.item(), loss_conf.item(), loss_cls.item()
+        if self.normalize:
+            num_pos = torch.sum(mask)
+            num_pos = torch.max(num_pos, torch.ones_like(num_pos))
+        else:
+            num_pos = bs
+        return loss, num_pos
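The new normalize flag changes the denominator used in train.py when the three head losses are summed: the clamped count of positive samples instead of the batch size. A toy illustration with made-up numbers:

    import torch

    loss_sum = torch.tensor(120.0)   # hypothetical summed loss of one head
    bs = 8                           # batch size
    mask = torch.zeros(8, 3, 13, 13)
    mask[0, 1, 6, 6] = 1             # pretend only two positive anchors were assigned
    mask[3, 2, 4, 9] = 1

    num_pos = torch.max(torch.sum(mask), torch.ones(()))  # clamp to >= 1, as in forward()
    print(loss_sum / num_pos)  # normalize=True: divide by positives -> 60.0
    print(loss_sum / bs)       # normalize=False: divide by batch size -> 15.0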
 
     def get_target(self, target, anchors, in_w, in_h, ignore_threshold):
-        # Compute how many images there are in total
+        #-----------------------------------------------------#
+        #   Compute how many images there are in total
+        #-----------------------------------------------------#
         bs = len(target)
-        # Get the anchors
+        #-------------------------------------------------------#
+        #   Get the indices of the anchors belonging to this
+        #   feature layer, for filtering the anchors later
+        #-------------------------------------------------------#
         anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)]
         subtract_index = [0,3,6][self.feature_length.index(in_w)]
-        # Create arrays of all zeros or all ones
+        #-------------------------------------------------------#
+        #   Create arrays of all zeros or all ones
+        #-------------------------------------------------------#
         mask = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False)
         noobj_mask = torch.ones(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False)
@@ -158,55 +225,96 @@ class YOLOLoss(nn.Module):
         for b in range(bs):
             if len(target[b])==0:
                 continue
-            # Compute the positions on the feature layer
+            #-------------------------------------------------------#
+            #   Compute the center of each positive sample on the
+            #   feature layer
+            #-------------------------------------------------------#
             gxs = target[b][:, 0:1] * in_w
             gys = target[b][:, 1:2] * in_h
+            #-------------------------------------------------------#
+            #   Compute the width/height of each positive sample
+            #   relative to the feature layer
+            #-------------------------------------------------------#
             gws = target[b][:, 2:3] * in_w
             ghs = target[b][:, 3:4] * in_h
-            # Determine which grid cell each box belongs to
+            #-------------------------------------------------------#
+            #   Determine which feature point each positive sample
+            #   belongs to
+            #-------------------------------------------------------#
             gis = torch.floor(gxs)
             gjs = torch.floor(gys)
-            # Compute the positions of the ground-truth boxes
+            #-------------------------------------------------------#
+            #   Convert the ground-truth boxes to the form
+            #   num_true_box, 4
+            #-------------------------------------------------------#
             gt_box = torch.FloatTensor(torch.cat([torch.zeros_like(gws), torch.zeros_like(ghs), gws, ghs], 1))
-            # Compute the positions of all anchors
+            #-------------------------------------------------------#
+            #   Convert the anchors to the form
+            #   9, 4
+            #-------------------------------------------------------#
             anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((self.num_anchors, 2)), torch.FloatTensor(anchors)), 1))
-            # Compute the overlap
+            #-------------------------------------------------------#
+            #   Compute the IoU
+            #   num_true_box, 9
+            #-------------------------------------------------------#
             anch_ious = jaccard(gt_box, anchor_shapes)
-            # Find the best matching anchor box
+            #-------------------------------------------------------#
+            #   Find the anchor with the highest overlap
+            #   num_true_box,
+            #-------------------------------------------------------#
             best_ns = torch.argmax(anch_ious,dim=-1)
             for i, best_n in enumerate(best_ns):
                 if best_n not in anchor_index:
                     continue
-                # Masks
+                #-------------------------------------------------------------#
+                #   Extract the coordinates:
+                #   gi, gj are the x/y indices of the feature point matching
+                #   the ground-truth box;
+                #   gx, gy are the x/y coordinates of the ground-truth box;
+                #   gw, gh are its width and height
+                #-------------------------------------------------------------#
                 gi = gis[i].long()
                 gj = gjs[i].long()
                 gx = gxs[i]
                 gy = gys[i]
                 gw = gws[i]
                 gh = ghs[i]
-                # Masks
+
                 if (gj < in_h) and (gi < in_w):
                     best_n = best_n - subtract_index
-                    # Determine which anchors really contain objects
+
+                    #----------------------------------------#
+                    #   noobj_mask marks feature points
+                    #   without objects
+                    #----------------------------------------#
                     noobj_mask[b, best_n, gj, gi] = 0
+                    #----------------------------------------#
+                    #   mask marks feature points with objects
+                    #----------------------------------------#
                     mask[b, best_n, gj, gi] = 1
-                    # Compute the anchor center adjustment parameters
+                    #----------------------------------------#
+                    #   tx, ty are the ground-truth center
+                    #   adjustment parameters
+                    #----------------------------------------#
                     tx[b, best_n, gj, gi] = gx - gi.float()
                     ty[b, best_n, gj, gi] = gy - gj.float()
-                    # Compute the anchor width/height adjustment parameters
+                    #----------------------------------------#
+                    #   tw, th are the ground-truth width/height
+                    #   adjustment parameters
+                    #----------------------------------------#
                     tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n+subtract_index][0])
                     th[b, best_n, gj, gi] = math.log(gh / anchors[best_n+subtract_index][1])
-                    # Used to obtain the xywh scale
+                    #----------------------------------------#
+                    #   Used to scale the xywh loss:
+                    #   large objects get a smaller loss weight,
+                    #   small objects a larger one
+                    #----------------------------------------#
                     box_loss_scale_x[b, best_n, gj, gi] = target[b][i, 2]
                     box_loss_scale_y[b, best_n, gj, gi] = target[b][i, 3]
-                    # Objectness
+                    #----------------------------------------#
+                    #   tconf is the ground-truth objectness
+                    #----------------------------------------#
                     tconf[b, best_n, gj, gi] = 1
-                    # Class
+                    #----------------------------------------#
+                    #   tcls is the ground-truth class
+                    #----------------------------------------#
                     tcls[b, best_n, gj, gi, int(target[b][i, 4])] = 1
                 else:
                     print('Step {0} out of bound'.format(b))
@@ -216,10 +324,16 @@
         return mask, noobj_mask, tx, ty, tw, th, tconf, tcls, box_loss_scale_x, box_loss_scale_y
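The tx/ty/tw/th targets written here are the exact inverse of the box decoding in utils.DecodeBox: tx, ty are fractional offsets inside the matched cell (inverted by sigmoid(x) + grid_x), and tw, th are log-ratios to the matched anchor (inverted by exp(w) * anchor_w). A worked example with a hypothetical ground-truth box on the 13x13 layer:

    import math

    # hypothetical ground truth on a 13x13 layer: center (6.4, 6.7), size (4.2, 5.6)
    gx, gy, gw, gh = 6.4, 6.7, 4.2, 5.6
    anchor_w, anchor_h = 3.625, 2.8125      # e.g. anchor [116, 90] / stride 32

    gi, gj = int(gx), int(gy)               # matched feature point (6, 6)
    tx, ty = gx - gi, gy - gj               # fractional offsets inside the cell
    tw = math.log(gw / anchor_w)            # log-ratio to the anchor width
    th = math.log(gh / anchor_h)            # log-ratio to the anchor height
    print(round(tx, 3), round(ty, 3), round(tw, 3), round(th, 3))  # 0.4 0.7 0.147 0.689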
 
     def get_ignore(self,prediction,target,scaled_anchors,in_w, in_h,noobj_mask):
+        #-----------------------------------------------------#
+        #   Compute how many images there are in total
+        #-----------------------------------------------------#
         bs = len(target)
+        #-------------------------------------------------------#
+        #   Get the indices of the anchors belonging to this
+        #   feature layer, for filtering the anchors later
+        #-------------------------------------------------------#
         anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)]
         scaled_anchors = np.array(scaled_anchors)[anchor_index]
-        # print(scaled_anchors)
+
         # Adjustment parameters for the anchor center positions
         x = torch.sigmoid(prediction[..., 0])
         y = torch.sigmoid(prediction[..., 1])
@@ -243,7 +357,9 @@ class YOLOLoss(nn.Module):
         anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape)
         anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape)
 
-        # Compute the adjusted anchor centers and widths/heights
+        #-------------------------------------------------------#
+        #   Compute the adjusted anchor centers and widths/heights
+        #-------------------------------------------------------#
         pred_boxes = FloatTensor(prediction[..., :4].shape)
         pred_boxes[..., 0] = x.data + grid_x
         pred_boxes[..., 1] = y.data + grid_y
@@ -252,7 +368,15 @@ class YOLOLoss(nn.Module):
         for i in range(bs):
             pred_boxes_for_ignore = pred_boxes[i]
+            #-------------------------------------------------------#
+            #   Reshape the predictions
+            #   pred_boxes_for_ignore      num_anchors, 4
+            #-------------------------------------------------------#
             pred_boxes_for_ignore = pred_boxes_for_ignore.view(-1, 4)
+            #-------------------------------------------------------#
+            #   Compute the ground-truth boxes and convert them to
+            #   feature-layer scale
+            #   gt_box      num_true_box, 4
+            #-------------------------------------------------------#
             if len(target[i]) > 0:
                 gx = target[i][:, 0:1] * in_w
                 gy = target[i][:, 1:2] * in_h
@@ -260,11 +384,18 @@ class YOLOLoss(nn.Module):
                 gh = target[i][:, 3:4] * in_h
                 gt_box = torch.FloatTensor(torch.cat([gx, gy, gw, gh],-1)).type(FloatTensor)
 
+                #-------------------------------------------------------#
+                #   Compute the IoU
+                #   anch_ious       num_true_box, num_anchors
+                #-------------------------------------------------------#
                 anch_ious = jaccard(gt_box, pred_boxes_for_ignore)
+                #-------------------------------------------------------#
+                #   Maximum overlap of each prediction with any
+                #   ground-truth box
+                #   anch_ious_max   num_anchors
+                #-------------------------------------------------------#
                 anch_ious_max, _ = torch.max(anch_ious,dim=0)
                 anch_ious_max = anch_ious_max.view(pred_boxes[i].size()[:3])
                 noobj_mask[i][anch_ious_max>self.ignore_threshold] = 0
-                # print(torch.max(anch_ious))
         return noobj_mask
@@ -282,7 +413,7 @@ class Generator(object):
         self.train_batches = len(train_lines)
         self.image_size = image_size
 
-    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5):
+    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
         '''Random preprocessing for real-time data augmentation'''
         line = annotation_line.split()
         image = Image.open(line[0])
@@ -290,6 +421,35 @@ class Generator(object):
         h, w = input_shape
         box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
 
+        if not random:
+            scale = min(w/iw, h/ih)
+            nw = int(iw*scale)
+            nh = int(ih*scale)
+            dx = (w-nw)//2
+            dy = (h-nh)//2
+
+            image = image.resize((nw,nh), Image.BICUBIC)
+            new_image = Image.new('RGB', (w,h), (128,128,128))
+            new_image.paste(image, (dx, dy))
+            image_data = np.array(new_image, np.float32)
+
+            # Adjust the target box coordinates
+            box_data = np.zeros((len(box), 5))
+            if len(box) > 0:
+                np.random.shuffle(box)
+                box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
+                box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
+                box[:, 0:2][box[:, 0:2] < 0] = 0
+                box[:, 2][box[:, 2] > w] = w
+                box[:, 3][box[:, 3] > h] = h
+                box_w = box[:, 2] - box[:, 0]
+                box_h = box[:, 3] - box[:, 1]
+                box = box[np.logical_and(box_w > 1, box_h > 1)]  # keep valid boxes
+                box_data = np.zeros((len(box), 5))
+                box_data[:len(box)] = box
+
+            return image_data, box_data
+
         # resize image
         new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter)
         scale = rand(.25, 2)
@@ -342,13 +502,8 @@ class Generator(object):
         box = box[np.logical_and(box_w>1, box_h>1)]  # discard invalid box
         box_data = np.zeros((len(box),5))
         box_data[:len(box)] = box
-        if len(box) == 0:
-            return image_data, []
-
-        if (box_data[:,:4]>0).any():
-            return image_data, box_data
-        else:
-            return image_data, []
+
+        return image_data, box_data
 
     def generate(self, train=True):
         while True:
@@ -357,7 +512,10 @@ class Generator(object):
             inputs = []
             targets = []
             for annotation_line in lines:
-                img,y=self.get_random_data(annotation_line,self.image_size[0:2])
+                if train:
+                    img,y=self.get_random_data(annotation_line, self.image_size[0:2])
+                else:
+                    img,y=self.get_random_data(annotation_line, self.image_size[0:2], False)
 
                 if len(y)!=0:
                     boxes = np.array(y[:,:4],dtype=np.float32)
@@ -373,6 +531,7 @@ class Generator(object):
                     boxes[:,0] = boxes[:,0] + boxes[:,2]/2
                     boxes[:,1] = boxes[:,1] + boxes[:,3]/2
                     y = np.concatenate([boxes,y[:,-1:]],axis=-1)
+
                 img = np.array(img,dtype = np.float32)
 
                 inputs.append(np.transpose(img/255.0,(2,0,1)))
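The new random=False branch above maps annotation boxes through the same scale-and-pad transform as the letterboxed image. A worked numeric example, assuming a hypothetical 500x375 photo letterboxed to 416x416:

    import numpy as np

    iw, ih, w, h = 500, 375, 416, 416
    scale = min(w / iw, h / ih)                 # 0.832
    nw, nh = int(iw * scale), int(ih * scale)   # 416, 312
    dx, dy = (w - nw) // 2, (h - nh) // 2       # 0, 52

    box = np.array([[100, 150, 300, 350, 0]], dtype=np.float32)  # x1,y1,x2,y2,class
    box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
    box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
    print(box)  # approximately [[ 83.2 176.8 249.6 343.2   0. ]]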
diff --git a/predict.py b/predict.py
index 07c7406a3cab29426d9621d755bca0d0c7c78b50..9dceed406946f7a60aee9b9a45f7e4ba58540c9c 100644
--- a/predict.py
+++ b/predict.py
@@ -1,9 +1,14 @@
-#-------------------------------------#
-#       Predict on a single image
-#-------------------------------------#
-from yolo import YOLO
+'''
+A few notes on predict.py:
+1. It cannot do batch prediction. For batch prediction, walk a folder with
+   os.listdir() and open each image with Image.open for prediction
+   (a sketch follows the test.py diff below).
+2. To save the result, use r_image.save("img.jpg").
+3. To get the box coordinates, go into the detect_image function and read
+   the four values top, left, bottom, right.
+4. To crop out a target, use the obtained top, left, bottom, right values
+   to slice the original image as an array.
+'''
 from PIL import Image
 
+from yolo import YOLO
+
 yolo = YOLO()
 
 while True:
diff --git a/test.py b/test.py
index cc50ad7f4191073ea83064e2d14f3c6418846dff..999fc487982e1eef2b6aac75b6a2e175baf99ac2 100644
--- a/test.py
+++ b/test.py
@@ -5,6 +5,7 @@
 #--------------------------------------------#
 import torch
 from torchsummary import summary
+
 from nets.yolo3 import YoloBody
 from utils.config import Config
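Note 1 of the predict.py docstring describes batch prediction; a minimal sketch of that loop (img/ and img_out/ are hypothetical folder names):

    import os
    from PIL import Image
    from yolo import YOLO

    yolo = YOLO()
    img_dir = "img"                       # hypothetical input folder
    out_dir = "img_out"                   # hypothetical output folder
    os.makedirs(out_dir, exist_ok=True)

    for name in os.listdir(img_dir):
        if not name.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        image = Image.open(os.path.join(img_dir, name))
        r_image = yolo.detect_image(image)        # note 2: save with r_image.save(...)
        r_image.save(os.path.join(out_dir, name))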
diff --git a/train.py b/train.py
index 44e6521547531124527be2ba38ff03a9c230d9d2..36d50bff21dcde29bf377fc51f921cc81b8a734b 100644
--- a/train.py
+++ b/train.py
@@ -2,21 +2,24 @@
 #   Train on the dataset
 #-------------------------------------#
 import os
-import numpy as np
 import time
+
+import numpy as np
 import torch
-from torch.autograd import Variable
+import torch.backends.cudnn as cudnn
 import torch.nn as nn
-import torch.optim as optim
 import torch.nn.functional as F
-import torch.backends.cudnn as cudnn
-from utils.config import Config
+import torch.optim as optim
+from torch.autograd import Variable
 from torch.utils.data import DataLoader
-from utils.dataloader import yolo_dataset_collate, YoloDataset
-from nets.yolo_training import YOLOLoss,Generator
-from nets.yolo3 import YoloBody
 from tqdm import tqdm
 
+from nets.yolo3 import YoloBody
+from nets.yolo_training import Generator, YOLOLoss
+from utils.config import Config
+from utils.dataloader import YoloDataset, yolo_dataset_collate
+
+
 def get_lr(optimizer):
     for param_group in optimizer.param_groups:
         return param_group['lr']
@@ -24,7 +27,8 @@ def get_lr(optimizer):
 def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epoch,cuda):
     total_loss = 0
     val_loss = 0
-    start_time = time.time()
+
+    net.train()
     with tqdm(total=epoch_size,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
         for iteration, batch in enumerate(gen):
             if iteration >= epoch_size:
@@ -37,25 +41,38 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
             else:
                 images = Variable(torch.from_numpy(images).type(torch.FloatTensor))
                 targets = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets]
+
+            #----------------------#
+            #   Zero the gradients
+            #----------------------#
             optimizer.zero_grad()
+            #----------------------#
+            #   Forward pass
+            #----------------------#
             outputs = net(images)
             losses = []
+            num_pos_all = 0
+            #----------------------#
+            #   Compute the loss
+            #----------------------#
             for i in range(3):
-                loss_item = yolo_losses[i](outputs[i], targets)
-                losses.append(loss_item[0])
-            loss = sum(losses)
+                loss_item, num_pos = yolo_losses[i](outputs[i], targets)
+                losses.append(loss_item)
+                num_pos_all += num_pos
+
+            loss = sum(losses) / num_pos_all
+            #----------------------#
+            #   Backward pass
+            #----------------------#
             loss.backward()
             optimizer.step()
 
-            total_loss += loss
-            waste_time = time.time() - start_time
+            total_loss += loss.item()
 
-            pbar.set_postfix(**{'total_loss': total_loss.item() / (iteration + 1), 
-                                'lr'        : get_lr(optimizer),
-                                'step/s'    : waste_time})
+            pbar.set_postfix(**{'total_loss': total_loss / (iteration + 1), 
+                                'lr'        : get_lr(optimizer)})
             pbar.update(1)
-            start_time = time.time()
 
     net.eval()
     print('Start Validation')
     with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
@@ -74,14 +91,15 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
                 optimizer.zero_grad()
                 outputs = net(images_val)
                 losses = []
+                num_pos_all = 0
                 for i in range(3):
-                    loss_item = yolo_losses[i](outputs[i], targets_val)
-                    losses.append(loss_item[0])
-                loss = sum(losses)
-                val_loss += loss
-                pbar.set_postfix(**{'total_loss': val_loss.item() / (iteration + 1)})
+                    loss_item, num_pos = yolo_losses[i](outputs[i], targets_val)
+                    losses.append(loss_item)
+                    num_pos_all += num_pos
+                loss = sum(losses) / num_pos_all
+                val_loss += loss.item()
+                pbar.set_postfix(**{'total_loss': val_loss / (iteration + 1)})
             pbar.update(1)
-    net.train()
     print('Finish Validation')
     print('Epoch:'+ str(epoch+1) + '/' + str(Epoch))
     print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss/(epoch_size+1),val_loss/(epoch_size_val+1)))
@@ -94,22 +112,33 @@ def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epo
 #   https://www.bilibili.com/video/BV1zE411u7Vw
 #----------------------------------------------------#
 if __name__ == "__main__":
-    # Parameter initialization
-    annotation_path = '2007_train.txt'
-    model = YoloBody(Config)
+    #-------------------------------#
+    #   Whether to use CUDA;
+    #   set to False if there is no GPU
+    #-------------------------------#
     Cuda = True
     #-------------------------------#
    #   Whether to use the DataLoader
     #-------------------------------#
     Use_Data_Loader = True
+    #------------------------------------------------------#
+    #   Whether to normalize the loss
+    #------------------------------------------------------#
+    normalize = True
+    #------------------------------------------------------#
+    #   Create the yolo model.
+    #   Be sure to modify the classes parameter in Config
+    #   before training.
+    #------------------------------------------------------#
+    model = YoloBody(Config)
 
-    #-------------------------------------------#
-    #   See the README for downloading the weights file
-    #-------------------------------------------#
+    #------------------------------------------------------#
+    #   For the weights file see the README
+    #   (Baidu Netdisk download)
+    #------------------------------------------------------#
+    model_path = "model_data/yolo_weights.pth"
     print('Loading weights into state dict...')
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model_dict = model.state_dict()
-    pretrained_dict = torch.load("model_data/yolo_weights.pth", map_location=device)
+    pretrained_dict = torch.load(model_path, map_location=device)
     pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
     model_dict.update(pretrained_dict)
     model.load_state_dict(model_dict)
@@ -126,9 +155,17 @@ if __name__ == "__main__":
     yolo_losses = []
     for i in range(3):
         yolo_losses.append(YOLOLoss(np.reshape(Config["yolo"]["anchors"],[-1,2]),
-                                    Config["yolo"]["classes"], (Config["img_w"], Config["img_h"]), Cuda))
+                                    Config["yolo"]["classes"], (Config["img_w"], Config["img_h"]), Cuda, normalize))
 
-    # 0.1 for validation, 0.9 for training
+    #----------------------------------------------------#
+    #   Get the image paths and labels
+    #----------------------------------------------------#
+    annotation_path = '2007_train.txt'
+    #----------------------------------------------------------------------#
+    #   The validation split is performed here in train.py.
+    #   It is normal for 2007_test.txt and 2007_val.txt to be empty;
+    #   training does not use them.
+    #   With the current split, the validation:training ratio is 1:9.
+    #----------------------------------------------------------------------#
     val_split = 0.1
     with open(annotation_path) as f:
         lines = f.readlines()
@@ -138,17 +175,15 @@ if __name__ == "__main__":
     num_val = int(len(lines)*val_split)
     num_train = len(lines) - num_val
 
-    #------------------------------------------------------#
     #   The backbone features are generic; freeze training speeds things up
     #   and keeps the weights from being destroyed early in training.
     #   Init_Epoch is the starting epoch;
     #   Freeze_Epoch is the number of epochs trained with a frozen backbone;
-    #   Epoch is the total number of training epochs.
+    #   Unfreeze_Epoch is the total number of training epochs.
     #   If you hit OOM or run out of VRAM, reduce Batch_size.
     #------------------------------------------------------#
     if True:
-        # Starting with a learning rate of 1e-3 converges faster
         lr = 1e-3
         Batch_size = 8
         Init_Epoch = 0
@@ -158,17 +193,17 @@ if __name__ == "__main__":
         lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95)
 
         if Use_Data_Loader:
-            train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]))
-            val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]))
+            train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]), True)
+            val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]), False)
             gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True,
                              drop_last=True, collate_fn=yolo_dataset_collate)
             gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True, 
                                  drop_last=True, collate_fn=yolo_dataset_collate)
         else:
             gen = Generator(Batch_size, lines[:num_train],
-                            (Config["img_h"], Config["img_w"])).generate()
+                            (Config["img_h"], Config["img_w"])).generate(True)
             gen_val = Generator(Batch_size, lines[num_train:],
-                            (Config["img_h"], Config["img_w"])).generate()
+                            (Config["img_h"], Config["img_w"])).generate(False)
 
         epoch_size = num_train//Batch_size
         epoch_size_val = num_val//Batch_size
@@ -190,18 +225,19 @@ if __name__ == "__main__":
         optimizer = optim.Adam(net.parameters(),lr)
         lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95)
+
         if Use_Data_Loader:
-            train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]))
-            val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]))
+            train_dataset = YoloDataset(lines[:num_train], (Config["img_h"], Config["img_w"]), True)
+            val_dataset = YoloDataset(lines[num_train:], (Config["img_h"], Config["img_w"]), False)
             gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True,
                              drop_last=True, collate_fn=yolo_dataset_collate)
             gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True, 
                                  drop_last=True, collate_fn=yolo_dataset_collate)
         else:
             gen = Generator(Batch_size, lines[:num_train],
-                            (Config["img_h"], Config["img_w"])).generate()
+                            (Config["img_h"], Config["img_w"])).generate(True)
             gen_val = Generator(Batch_size, lines[num_train:],
-                            (Config["img_h"], Config["img_w"])).generate()
+                            (Config["img_h"], Config["img_w"])).generate(False)
 
         epoch_size = num_train//Batch_size
         epoch_size_val = num_val//Batch_size
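The freeze-training comments correspond to toggling requires_grad on the darknet53 backbone between the two stages, presumably via the backbone attribute the diff shows YoloBody exposing. A hedged sketch of the mechanism (the repo's own train.py may differ in detail):

    from nets.yolo3 import YoloBody
    from utils.config import Config

    model = YoloBody(Config)

    # stage 1: freeze the darknet53 backbone, train only the yolo heads
    for param in model.backbone.parameters():
        param.requires_grad = False

    # stage 2 (after Freeze_Epoch): unfreeze and train everything at a lower lr
    for param in model.backbone.parameters():
        param.requires_grad = True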
diff --git a/utils/config.py b/utils/config.py
index 41656d1e3e7c81821c0c20ac7dd13bfdb125e0a6..460beca4c37eb93ae8189629a87d5712bed52cd1 100644
--- a/utils/config.py
+++ b/utils/config.py
@@ -1,11 +1,19 @@
 Config = \
-{
+{
+    #-------------------------------------------------------------#
+    #   Be sure to modify the classes parameter before training.
+    #   The anchors can stay unchanged: they generalize well,
+    #   and the large/medium/small split matches yolo's feature
+    #   layers nicely.
+    #-------------------------------------------------------------#
     "yolo": {
         "anchors": [[[116, 90], [156, 198], [373, 326]],
                     [[30, 61], [62, 45], [59, 119]],
                     [[10, 13], [16, 30], [33, 23]]],
         "classes": 20,
     },
+    #-------------------------------------------------------------#
+    #   img_h and img_w may be changed to 608x608
+    #-------------------------------------------------------------#
     "img_h": 416,
     "img_w": 416,
}
diff --git a/utils/dataloader.py b/utils/dataloader.py
index 093c951bc7b44dcb93f13d52226eb1d1ef0ab2c6..398a47990bd4c44bc16e4927d2dbe5b10d52c94f 100644
--- a/utils/dataloader.py
+++ b/utils/dataloader.py
@@ -13,12 +13,13 @@ from nets.yolo_training import Generator
 import cv2
 
 class YoloDataset(Dataset):
-    def __init__(self, train_lines, image_size):
+    def __init__(self, train_lines, image_size, is_train):
         super(YoloDataset, self).__init__()
 
         self.train_lines = train_lines
         self.train_batches = len(train_lines)
         self.image_size = image_size
+        self.is_train = is_train
 
     def __len__(self):
         return self.train_batches
@@ -26,7 +27,7 @@ class YoloDataset(Dataset):
     def rand(self, a=0, b=1):
         return np.random.rand() * (b - a) + a
 
-    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5):
+    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
         """Random preprocessing for real-time data augmentation"""
         line = annotation_line.split()
         image = Image.open(line[0])
@@ -34,6 +35,35 @@ class YoloDataset(Dataset):
         h, w = input_shape
         box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
 
+        if not random:
+            scale = min(w/iw, h/ih)
+            nw = int(iw*scale)
+            nh = int(ih*scale)
+            dx = (w-nw)//2
+            dy = (h-nh)//2
+
+            image = image.resize((nw,nh), Image.BICUBIC)
+            new_image = Image.new('RGB', (w,h), (128,128,128))
+            new_image.paste(image, (dx, dy))
+            image_data = np.array(new_image, np.float32)
+
+            # Adjust the target box coordinates
+            box_data = np.zeros((len(box), 5))
+            if len(box) > 0:
+                np.random.shuffle(box)
+                box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
+                box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
+                box[:, 0:2][box[:, 0:2] < 0] = 0
+                box[:, 2][box[:, 2] > w] = w
+                box[:, 3][box[:, 3] > h] = h
+                box_w = box[:, 2] - box[:, 0]
+                box_h = box[:, 3] - box[:, 1]
+                box = box[np.logical_and(box_w > 1, box_h > 1)]  # keep valid boxes
+                box_data = np.zeros((len(box), 5))
+                box_data[:len(box)] = box
+
+            return image_data, box_data
+
         # Resize the image
         new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
         scale = self.rand(.25, 2)
@@ -48,8 +78,7 @@ class YoloDataset(Dataset):
         # Place the image
         dx = int(self.rand(0, w - nw))
         dy = int(self.rand(0, h - nh))
-        new_image = Image.new('RGB', (w, h),
-                              (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)))
+        new_image = Image.new('RGB', (w, h), (128, 128, 128))
         new_image.paste(image, (dx, dy))
         image = new_image
@@ -89,19 +118,18 @@ class YoloDataset(Dataset):
         box = box[np.logical_and(box_w > 1, box_h > 1)]  # keep valid boxes
         box_data = np.zeros((len(box), 5))
         box_data[:len(box)] = box
-        if len(box) == 0:
-            return image_data, []
-
-        if (box_data[:, :4] > 0).any():
-            return image_data, box_data
-        else:
-            return image_data, []
+
+        return image_data, box_data
 
     def __getitem__(self, index):
         lines = self.train_lines
         n = self.train_batches
         index = index % n
-        img, y = self.get_random_data(lines[index], self.image_size[0:2])
+        if self.is_train:
+            img, y = self.get_random_data(lines[index], self.image_size[0:2])
+        else:
+            img, y = self.get_random_data(lines[index], self.image_size[0:2], False)
+
         if len(y) != 0:
             # Convert coordinates to fractions in 0~1
             boxes = np.array(y[:, :4], dtype=np.float32)
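Below this hunk, __getitem__ goes on to convert the pixel corner boxes into the normalized center-size form that YOLOLoss.get_target expects (it multiplies them back by in_w/in_h). The conversion in isolation:

    import numpy as np

    h, w = 416, 416
    boxes = np.array([[83.2, 176.8, 249.6, 343.2]], dtype=np.float32)  # x1,y1,x2,y2 in pixels

    boxes[:, [0, 2]] /= w                  # to 0~1 fractions
    boxes[:, [1, 3]] /= h
    boxes[:, 2:4] -= boxes[:, 0:2]         # corners -> width/height
    boxes[:, 0:2] += boxes[:, 2:4] / 2     # top-left -> center
    print(boxes)  # approximately [[0.4 0.625 0.4 0.4]]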
diff --git a/utils/utils.py b/utils/utils.py
index e67dcc3f89a6c47707a2e27e6c00a0fee1f59d90..2055a8fb0316a7856bbe8c9f4e852dabf1a580aa 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,18 +1,26 @@
 from __future__ import division
-import os
+
 import math
+import os
 import time
+
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import numpy as np
+from PIL import Image, ImageDraw, ImageFont
 from torch.autograd import Variable
 from torchvision.ops import nms
-from PIL import Image, ImageDraw, ImageFont
+
 
 class DecodeBox(nn.Module):
     def __init__(self, anchors, num_classes, img_size):
         super(DecodeBox, self).__init__()
+        #-----------------------------------------------------------#
+        #   The 13x13 feature layer uses anchors [116,90],[156,198],[373,326]
+        #   The 26x26 feature layer uses anchors [30,61],[62,45],[59,119]
+        #   The 52x52 feature layer uses anchors [10,13],[16,30],[33,23]
+        #-----------------------------------------------------------#
         self.anchors = anchors
         self.num_anchors = len(anchors)
         self.num_classes = num_classes
@@ -20,17 +28,33 @@ class DecodeBox(nn.Module):
         self.img_size = img_size
 
     def forward(self, input):
+        #-----------------------------------------------#
+        #   There are three inputs; their shapes are
+        #   batch_size, 255, 13, 13
+        #   batch_size, 255, 26, 26
+        #   batch_size, 255, 52, 52
+        #-----------------------------------------------#
         batch_size = input.size(0)
         input_height = input.size(2)
         input_width = input.size(3)
 
-        # Compute the stride
+        #-----------------------------------------------#
+        #   With a 416x416 input,
+        #   stride_h = stride_w = 32, 16, 8
+        #-----------------------------------------------#
         stride_h = self.img_size[1] / input_height
         stride_w = self.img_size[0] / input_width
 
-        # Normalize to the feature layer
+        #-------------------------------------------------#
+        #   The scaled_anchors obtained here are sized
+        #   relative to the feature layer
+        #-------------------------------------------------#
         scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors]
 
-        # Resize the prediction
+        #-----------------------------------------------#
+        #   There are three inputs; after reshaping their shapes are
+        #   batch_size, 3, 13, 13, 85
+        #   batch_size, 3, 26, 26, 85
+        #   batch_size, 3, 52, 52, 85
+        #-----------------------------------------------#
         prediction = input.view(batch_size, self.num_anchors,
                                 self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
@@ -38,37 +62,48 @@ class DecodeBox(nn.Module):
         # Adjustment parameters for the anchor center positions
         x = torch.sigmoid(prediction[..., 0])
         y = torch.sigmoid(prediction[..., 1])
         # Adjustment parameters for the anchor width and height
-        w = prediction[..., 2]  # Width
-        h = prediction[..., 3]  # Height
-
+        w = prediction[..., 2]
+        h = prediction[..., 3]
         # Objectness confidence: is there an object
         conf = torch.sigmoid(prediction[..., 4])
         # Class confidence
-        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
+        pred_cls = torch.sigmoid(prediction[..., 5:])
 
         FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
         LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
 
-        # Generate the grid; anchor centers are the grid's top-left corners  batch_size,3,13,13
+        #----------------------------------------------------------#
+        #   Generate the grid; anchor centers sit at the grid's
+        #   top-left corners
+        #   batch_size,3,13,13
+        #----------------------------------------------------------#
         grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
             batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
         grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
             batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
 
-        # Generate the anchor widths/heights
+        #----------------------------------------------------------#
+        #   Generate the anchor widths/heights in grid layout
+        #   batch_size,3,13,13
+        #----------------------------------------------------------#
         anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
         anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
         anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
         anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
 
-        # Compute the adjusted anchor centers and widths/heights
+        #----------------------------------------------------------#
+        #   Adjust the anchors using the predictions:
+        #   first shift the anchor centers towards the bottom-right,
+        #   then adjust the anchor widths and heights.
+        #----------------------------------------------------------#
         pred_boxes = FloatTensor(prediction[..., :4].shape)
         pred_boxes[..., 0] = x.data + grid_x
         pred_boxes[..., 1] = y.data + grid_y
         pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
         pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
 
-        # Rescale the output to 416x416
+        #----------------------------------------------------------#
+        #   Rescale the output to the input image size
+        #----------------------------------------------------------#
         _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)
         output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale,
                             conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
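The pred_boxes assignments above are the standard yolov3 decode: sigmoid offsets added to the grid cell, exponential scaling of the anchor, then multiplication by the stride (_scale). One 13x13 cell worked through with made-up predictions:

    import math

    stride = 32.0                       # 416 / 13
    grid_x, grid_y = 6, 6               # cell index
    x, y = 0.4, 0.7                     # sigmoid outputs
    w, h = 0.148, 0.688                 # raw width/height predictions
    anchor_w, anchor_h = 116 / stride, 90 / stride   # scaled anchor [116, 90]

    cx = (x + grid_x) * stride            # 204.8 px on the 416x416 input
    cy = (y + grid_y) * stride            # 214.4 px
    bw = math.exp(w) * anchor_w * stride  # ~134.5 px
    bh = math.exp(h) * anchor_h * stride  # ~179.1 px
    print(cx, cy, round(bw, 1), round(bh, 1))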
@@ -139,7 +174,10 @@ def bbox_iou(box1, box2, x1y1x2y2=True):
 
 def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
-    # Compute the top-left and bottom-right corners
+    #----------------------------------------------------------#
+    #   Convert the predictions to top-left/bottom-right corner format.
+    #   prediction  [batch_size, num_anchors, 85]
+    #----------------------------------------------------------#
     box_corner = prediction.new(prediction.shape)
     box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
     box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
@@ -149,21 +187,35 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
 
     output = [None for _ in range(len(prediction))]
     for image_i, image_pred in enumerate(prediction):
-        # Get the classes and their confidences
+        #----------------------------------------------------------#
+        #   Take the max over the class predictions.
+        #   class_conf  [batch_size, num_anchors, 1]    class confidence
+        #   class_pred  [batch_size, num_anchors, 1]    class index
+        #----------------------------------------------------------#
         class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
 
-        # First round of filtering by confidence
-        conf_mask = (image_pred[:, 4]*class_conf[:, 0] >= conf_thres).squeeze()
+        #----------------------------------------------------------#
+        #   First round of filtering by confidence
+        #----------------------------------------------------------#
+        conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
 
+        #----------------------------------------------------------#
+        #   Filter the predictions by confidence
+        #----------------------------------------------------------#
         image_pred = image_pred[conf_mask]
         class_conf = class_conf[conf_mask]
         class_pred = class_pred[conf_mask]
         if not image_pred.size(0):
             continue
 
-        # The content is (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+        #-------------------------------------------------------------------------#
+        #   detections  [num_anchors, 7]
+        #   the 7 values are: x1, y1, x2, y2, obj_conf, class_conf, class_pred
+        #-------------------------------------------------------------------------#
         detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
 
-        # Get the classes
+        #------------------------------------------#
+        #   Get all classes present in the predictions
+        #------------------------------------------#
         unique_labels = detections[:, -1].cpu().unique()
 
         if prediction.is_cuda:
@@ -171,7 +223,9 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
             detections = detections.cuda()
 
         for c in unique_labels:
-            # Get all predictions of one class after the preliminary filtering
+            #------------------------------------------#
+            #   Get all predictions of one class after
+            #   score filtering
+            #------------------------------------------#
             detections_class = detections[detections[:, -1] == c]
 
             #------------------------------------------#
@@ -179,7 +233,7 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
             #------------------------------------------#
             keep = nms(
                 detections_class[:, :4],
-                detections_class[:, 4]*detections_class[:, 5],
+                detections_class[:, 4] * detections_class[:, 5],
                 nms_thres
             )
             max_detections = detections_class[keep]
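The per-class loop above calls torchvision's nms once for each class present. The call in isolation, with dummy corner-format boxes and scores = obj_conf * class_conf:

    import torch
    from torchvision.ops import nms

    boxes = torch.tensor([[100., 100., 200., 200.],
                          [105., 105., 205., 205.],   # heavy overlap with the first
                          [300., 300., 400., 400.]])
    scores = torch.tensor([0.9, 0.8, 0.7])

    keep = nms(boxes, scores, iou_threshold=0.4)
    print(keep)   # tensor([0, 2]) -- the lower-scored overlapping box is suppressed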
diff --git a/video.py b/video.py
index 3c21ddcc860f4ced365637c1b7a9a4915a159d98..76cb910d5a470a9178e441fd6da3885b39218e17 100644
--- a/video.py
+++ b/video.py
@@ -1,15 +1,23 @@
 #-------------------------------------#
-#       Camera detection
+#       Detect from a camera or a video.
+#       For the camera, just run the script.
+#       For a video, point cv2.VideoCapture() at a path.
+#       Saving the video is not hard; a quick web search will show how.
 #-------------------------------------#
-from yolo import YOLO
-from PIL import Image
-import numpy as np
-import cv2
 import time
 
-yolo = YOLO()
-# Open the camera
-capture=cv2.VideoCapture(0) # capture=cv2.VideoCapture("1.mp4")
+import cv2
+import numpy as np
+from PIL import Image
+
+from yolo import YOLO
+
+yolo = YOLO()
+#-------------------------------------#
+#   Open the camera
+#   capture=cv2.VideoCapture("1.mp4")
+#-------------------------------------#
+capture=cv2.VideoCapture(0)
 fps = 0.0
 while(True):
     t1 = time.time()
@@ -19,10 +27,8 @@ while(True):
     frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
     # Convert to Image
     frame = Image.fromarray(np.uint8(frame))
-    # Run detection
     frame = np.array(yolo.detect_image(frame))
-
     # RGB to BGR to match OpenCV's display format
     frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
@@ -32,7 +38,6 @@ while(True):
     cv2.imshow("video",frame)
-
     c= cv2.waitKey(1) & 0xff 
     if c==27:
         capture.release()
diff --git a/voc_annotation.py b/voc_annotation.py
index aad06573cb4ebb7df461256759d793fe2ad44827..ca0f88d364b7792396625da86c09a928c01158f0 100644
--- a/voc_annotation.py
+++ b/voc_annotation.py
@@ -1,3 +1,8 @@
+#---------------------------------------------#
+#   Be sure to modify classes before running.
+#   If the generated 2007_train.txt contains no object information,
+#   it is because classes was not set correctly.
+#---------------------------------------------#
 import xml.etree.ElementTree as ET
 from os import getcwd
diff --git a/yolo.py b/yolo.py
index 06b396586cfa7e92df1908352f6d29579df96240..d80840a38a6dbfa36e93687e7e9bc4dce6b71904 100644
--- a/yolo.py
+++ b/yolo.py
@@ -1,22 +1,28 @@
 #-------------------------------------#
 #       Create the YOLO class
 #-------------------------------------#
-import cv2
-import numpy as np
 import colorsys
 import os
+
+import cv2
+import numpy as np
 import torch
-import torch.nn as nn
-from nets.yolo3 import YoloBody
 import torch.backends.cudnn as cudnn
-from PIL import Image,ImageFont, ImageDraw
+import torch.nn as nn
+from PIL import Image, ImageDraw, ImageFont
 from torch.autograd import Variable
+
+from nets.yolo3 import YoloBody
 from utils.config import Config
-from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes
+from utils.utils import (DecodeBox, bbox_iou, letterbox_image,
+                         non_max_suppression, yolo_correct_boxes)
+
 
 #--------------------------------------------#
 #   To predict with your own trained model, two parameters
 #   must be changed: both model_path and classes_path!
+#   If you get a shape mismatch, double-check the model_path
+#   and classes_path settings used for training.
 #--------------------------------------------#
 class YOLO(object):
     _defaults = {
@@ -52,14 +58,20 @@ class YOLO(object):
             class_names = f.readlines()
         class_names = [c.strip() for c in class_names]
         return class_names
+
     #---------------------------------------------------#
-    #   Get all the classes
+    #   Build the model
     #---------------------------------------------------#
     def generate(self):
         self.config["yolo"]["classes"] = len(self.class_names)
+        #---------------------------------------------------#
+        #   Build the yolov3 model
+        #---------------------------------------------------#
         self.net = YoloBody(self.config)
 
-        # Speed up model training
+        #---------------------------------------------------#
+        #   Load the yolov3 weights
+        #---------------------------------------------------#
         print('Loading weights into state dict...')
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         state_dict = torch.load(self.model_path, map_location=device)
@@ -71,10 +83,12 @@ class YOLO(object):
             self.net = nn.DataParallel(self.net)
             self.net = self.net.cuda()
 
+        #---------------------------------------------------#
+        #   Build the decoders for the three feature layers
+        #---------------------------------------------------#
         self.yolo_decodes = []
         for i in range(3):
-            self.yolo_decodes.append(DecodeBox(self.config["yolo"]["anchors"][i], self.config["yolo"]["classes"],  (self.model_image_size[1], self.model_image_size[0])))
-
+            self.yolo_decodes.append(DecodeBox(self.config["yolo"]["anchors"][i], self.config["yolo"]["classes"], (self.model_image_size[1], self.model_image_size[0])))
 
         print('{} model, anchors, and classes loaded.'.format(self.model_path))
         # Set different colors for the boxes
@@ -91,44 +105,65 @@ class YOLO(object):
     def detect_image(self, image):
         image_shape = np.array(np.shape(image)[0:2])
 
+        #---------------------------------------------------------#
+        #   Add gray bars to the image for a distortion-free resize
+        #---------------------------------------------------------#
         crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
-        photo = np.array(crop_img,dtype = np.float32)
-        photo /= 255.0
+        photo = np.array(crop_img,dtype = np.float32) / 255.0
         photo = np.transpose(photo, (2, 0, 1))
-        photo = photo.astype(np.float32)
-        images = []
-        images.append(photo)
+        #---------------------------------------------------------#
+        #   Add the batch_size dimension
+        #---------------------------------------------------------#
+        images = [photo]
 
-        images = np.asarray(images)
-        images = torch.from_numpy(images)
-        if self.cuda:
-            images = images.cuda()
-
         with torch.no_grad():
+            images = torch.from_numpy(np.asarray(images))
+            if self.cuda:
+                images = images.cuda()
+
+            #---------------------------------------------------------#
+            #   Feed the image into the network for prediction!
+            #---------------------------------------------------------#
             outputs = self.net(images)
             output_list = []
             for i in range(3):
                 output_list.append(self.yolo_decodes[i](outputs[i]))
+
+            #---------------------------------------------------------#
+            #   Stack the prediction boxes, then run non-maximum suppression
+            #---------------------------------------------------------#
             output = torch.cat(output_list, 1)
             batch_detections = non_max_suppression(output, self.config["yolo"]["classes"],
                                                    conf_thres=self.confidence,
                                                    nms_thres=self.iou)
-        try :
-            batch_detections = batch_detections[0].cpu().numpy()
-        except:
-            return image
-        top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
-        top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
-        top_label = np.array(batch_detections[top_index,-1],np.int32)
-        top_bboxes = np.array(batch_detections[top_index,:4])
-        top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
-
-        # Remove the gray bars
-        boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
+
+            #---------------------------------------------------------#
+            #   If no object is detected, return the original image
+            #---------------------------------------------------------#
+            try :
+                batch_detections = batch_detections[0].cpu().numpy()
+            except:
+                return image
+
+            #---------------------------------------------------------#
+            #   Filter the prediction boxes by score
+            #---------------------------------------------------------#
+            top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence
+            top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
+            top_label = np.array(batch_detections[top_index,-1],np.int32)
+            top_bboxes = np.array(batch_detections[top_index,:4])
+            top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
+
+            #-----------------------------------------------------------------#
+            #   letterbox_image added gray bars around the image before it
+            #   was fed into the network, so the resulting top_bboxes are
+            #   relative to the padded image. Correct them to remove the
+            #   gray-bar offset.
+            #-----------------------------------------------------------------#
+            boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
 
         font = ImageFont.truetype(font='model_data/simhei.ttf',size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))
 
-        thickness = (np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0]
+        thickness = max((np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0], 1)
 
         for i, c in enumerate(top_label):
             predicted_class = self.class_names[c]
@@ -150,7 +185,7 @@ class YOLO(object):
             draw = ImageDraw.Draw(image)
             label_size = draw.textsize(label, font)
             label = label.encode('utf-8')
-            print(label)
+            print(label, top, left, bottom, right)
             
             if top - label_size[1] >= 0:
                 text_origin = np.array([left, top - label_size[1]])
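Notes 3 and 4 of predict.py refer to the top, left, bottom, right values computed in this loop (with the new print they also appear on stdout). A hedged sketch of cropping one detection out of the original image; the file names and coordinate values are illustrative:

    import numpy as np
    from PIL import Image

    # hypothetical values read from one iteration of the detect_image loop
    top, left, bottom, right = 176, 83, 343, 250

    image = Image.open("street.jpg")                  # hypothetical input image
    crop = np.array(image)[top:bottom, left:right]    # note 4: slice the array
    Image.fromarray(crop).save("target_0.jpg")        # hypothetical output name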