diff --git a/VOCdevkit/VOC2007/Annotations/README.md b/VOCdevkit/VOC2007/Annotations/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d73a1916c99aecd41ce3fbebad1da573bc3c5845
--- /dev/null
+++ b/VOCdevkit/VOC2007/Annotations/README.md
@@ -0,0 +1 @@
+Stores the annotation (label) files
\ No newline at end of file
diff --git a/VOCdevkit/VOC2007/ImageSets/Main/README.md b/VOCdevkit/VOC2007/ImageSets/Main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f64921067993322973aba9260091c48c26aa43f
--- /dev/null
+++ b/VOCdevkit/VOC2007/ImageSets/Main/README.md
@@ -0,0 +1 @@
+Stores the training index files
\ No newline at end of file
diff --git a/VOCdevkit/VOC2007/JPEGImages/README.md b/VOCdevkit/VOC2007/JPEGImages/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..58f32ea1aa6b31fcbbeba5b862e31bfff5ae527c
--- /dev/null
+++ b/VOCdevkit/VOC2007/JPEGImages/README.md
@@ -0,0 +1 @@
+Stores the image files
\ No newline at end of file
diff --git a/VOCdevkit/VOC2007/voc2yolo4.py b/VOCdevkit/VOC2007/voc2yolo4.py
new file mode 100644
index 0000000000000000000000000000000000000000..02458b71c6d285366ce5caafd5c25a3ec55fdf91
--- /dev/null
+++ b/VOCdevkit/VOC2007/voc2yolo4.py
@@ -0,0 +1,44 @@
+import os
+import random
+
+xmlfilepath=r'./VOCdevkit/VOC2007/Annotations'
+saveBasePath=r"./VOCdevkit/VOC2007/ImageSets/Main/"
+
+trainval_percent=0
+train_percent=1
+
+temp_xml = os.listdir(xmlfilepath)
+total_xml = []
+for xml in temp_xml:
+    if xml.endswith(".xml"):
+        total_xml.append(xml)
+
+num=len(total_xml)
+indices=range(num)
+tv=int(num*trainval_percent)
+tr=int(tv*train_percent)
+trainval= random.sample(indices,tv)
+train=random.sample(trainval,tr)
+
+print("train and val size",tv)
+print("train size",tr)
+ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w')
+ftest = open(os.path.join(saveBasePath,'test.txt'), 'w')
+ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w')
+fval = open(os.path.join(saveBasePath,'val.txt'), 'w')
+
+for i in indices:
+    name=total_xml[i][:-4]+'\n'
+    if i in trainval:
+        ftrainval.write(name)
+        if i in train:
+            ftrain.write(name)
+        else:
+            fval.write(name)
+    else:
+        ftest.write(name)
+
+ftrainval.close()
+ftrain.close()
+fval.close()
+ftest.close()
diff --git a/ciou_test.py b/ciou_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ad2228964050a53f46fa99a4f43706d61fd2a6
--- /dev/null
+++ b/ciou_test.py
@@ -0,0 +1,56 @@
+import torch
+import math
+import numpy as np
+def box_ciou(b1, b2):
+    """
+    Inputs:
+    ----------
+    b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
+    b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
+
+    Returns:
+    -------
+    ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
+    """
+    # top-left and bottom-right corners of the predicted boxes
+    b1_xy = b1[..., :2]
+    b1_wh = b1[..., 2:4]
+    b1_wh_half = b1_wh/2.
+    b1_mins = b1_xy - b1_wh_half
+    b1_maxes = b1_xy + b1_wh_half
+    # top-left and bottom-right corners of the ground-truth boxes
+    b2_xy = b2[..., :2]
+    b2_wh = b2[..., 2:4]
+    b2_wh_half = b2_wh/2.
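+    # (worked example of this xywh -> corner conversion, using box1 from the test
+    #  at the bottom of this file: a box with center (25, 25) and size 40x40 has
+    #  mins = (5, 5) and maxes = (45, 45))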
+ b2_mins = b2_xy - b2_wh_half + b2_maxes = b2_xy + b2_wh_half + + # 求真实框和预测框所有的iou + intersect_mins = torch.max(b1_mins, b2_mins) + intersect_maxes = torch.min(b1_maxes, b2_maxes) + intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes)) + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + b1_area = b1_wh[..., 0] * b1_wh[..., 1] + b2_area = b2_wh[..., 0] * b2_wh[..., 1] + union_area = b1_area + b2_area - intersect_area + iou = intersect_area / (union_area + 1e-7) + + # 计算中心的差距 + center_distance = torch.sum(torch.pow((b1_xy - b2_xy), 2), axis=-1) + # 找到包裹两个框的最小框的左上角和右下角 + enclose_mins = torch.min(b1_mins, b2_mins) + enclose_maxes = torch.max(b1_maxes, b2_maxes) + enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes)) + # 计算对角线距离 + enclose_diagonal = torch.sum(torch.pow(enclose_wh,2), axis=-1) + ciou = iou - 1.0 * (center_distance) / (enclose_diagonal + 1e-7) + + v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0]/b1_wh[..., 1]) - torch.atan(b2_wh[..., 0]/b2_wh[..., 1])), 2) + alpha = v / (1.0 - iou + v) + ciou = ciou - alpha * v + return ciou + +box1 = torch.from_numpy(np.array([[25,25,40,40]])).type(torch.FloatTensor) +box2 = torch.from_numpy(np.array([[25,25,30,40]])).type(torch.FloatTensor) + +print(box_ciou(box1,box2)) \ No newline at end of file diff --git a/get_dr_txt.py b/get_dr_txt.py new file mode 100644 index 0000000000000000000000000000000000000000..b1136b0db428d1128217ea2aa599d35628f16a8a --- /dev/null +++ b/get_dr_txt.py @@ -0,0 +1,97 @@ +#-------------------------------------# +# mAP所需文件计算代码 +# 具体教程请查看Bilibili +# Bubbliiiing +#-------------------------------------# +import cv2 +import keras +import numpy as np +import colorsys +import os +import torch +import torch.nn as nn +import torch.backends.cudnn as cudnn +from torch.autograd import Variable +from yolo import YOLO +from nets.yolo4 import YoloBody +from PIL import Image,ImageFont, ImageDraw +from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes + +class mAP_Yolo(YOLO): + #---------------------------------------------------# + # 检测图片 + #---------------------------------------------------# + def detect_image(self,image_id,image): + self.confidence = 0.05 + f = open("./input/detection-results/"+image_id+".txt","w") + image_shape = np.array(np.shape(image)[0:2]) + + crop_img = np.array(letterbox_image(image, (self.model_image_size[0],self.model_image_size[1]))) + photo = np.array(crop_img,dtype = np.float32) + photo /= 255.0 + photo = np.transpose(photo, (2, 0, 1)) + photo = photo.astype(np.float32) + images = [] + images.append(photo) + images = np.asarray(images) + + with torch.no_grad(): + images = torch.from_numpy(images) + if self.cuda: + images = images.cuda() + + + outputs = self.net(images) + output_list = [] + for i in range(3): + output_list.append(self.yolo_decodes[i](outputs[i])) + output = torch.cat(output_list, 1) + batch_detections = non_max_suppression(output, len(self.class_names), + conf_thres=self.confidence, + nms_thres=0.3) + + try: + batch_detections = batch_detections[0].cpu().numpy() + except: + return image + + top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence + top_conf = batch_detections[top_index,4]*batch_detections[top_index,5] + top_label = np.array(batch_detections[top_index,-1],np.int32) + top_bboxes = np.array(batch_detections[top_index,:4]) + top_xmin, top_ymin, top_xmax, top_ymax = 
np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1) + + # 去掉灰条 + boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape) + + for i, c in enumerate(top_label): + predicted_class = self.class_names[c] + score = str(top_conf[i]) + + top, left, bottom, right = boxes[i] + f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)),str(int(bottom)))) + + f.close() + return + +yolo = mAP_Yolo() +image_ids = open('VOCdevkit/VOC2007/ImageSets/Main/test.txt').read().strip().split() + +if not os.path.exists("./input"): + os.makedirs("./input") +if not os.path.exists("./input/detection-results"): + os.makedirs("./input/detection-results") +if not os.path.exists("./input/images-optional"): + os.makedirs("./input/images-optional") + + +for image_id in image_ids: + image_path = "./VOCdevkit/VOC2007/JPEGImages/"+image_id+".jpg" + image = Image.open(image_path) + # 开启后在之后计算mAP可以可视化 + # image.save("./input/images-optional/"+image_id+".jpg") + yolo.detect_image(image_id,image) + print(image_id," done!") + + +print("Conversion completed!") \ No newline at end of file diff --git a/get_gt_txt.py b/get_gt_txt.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd1ddb47b67050704cda0eb0a24f9be8bd67f37 --- /dev/null +++ b/get_gt_txt.py @@ -0,0 +1,33 @@ +#-------------------------------------# +# mAP所需文件计算代码 +# 具体教程请查看Bilibili +# Bubbliiiing +#-------------------------------------# +import sys +import os +import glob +import xml.etree.ElementTree as ET + +image_ids = open('VOCdevkit/VOC2007/ImageSets/Main/test.txt').read().strip().split() + +if not os.path.exists("./input"): + os.makedirs("./input") +if not os.path.exists("./input/ground-truth"): + os.makedirs("./input/ground-truth") + +for image_id in image_ids: + with open("./input/ground-truth/"+image_id+".txt", "w") as new_f: + root = ET.parse("VOCdevkit/VOC2007/Annotations/"+image_id+".xml").getroot() + for obj in root.findall('object'): + if obj.find('difficult')!=None: + difficult = obj.find('difficult').text + if int(difficult)==1: + continue + obj_name = obj.find('name').text + bndbox = obj.find('bndbox') + left = bndbox.find('xmin').text + top = bndbox.find('ymin').text + right = bndbox.find('xmax').text + bottom = bndbox.find('ymax').text + new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom)) +print("Conversion completed!") \ No newline at end of file diff --git a/get_map.py b/get_map.py new file mode 100644 index 0000000000000000000000000000000000000000..2cd70c81d7a5ffc8f15f91785b1f986e0483bdb0 --- /dev/null +++ b/get_map.py @@ -0,0 +1,880 @@ +import glob +import json +import os +import shutil +import operator +import sys +import argparse +import math + +import numpy as np +#----------------------------------------------------# +# 用于计算mAP +# 代码克隆自https://github.com/Cartucho/mAP +#----------------------------------------------------# +MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge) + +parser = argparse.ArgumentParser() +parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true") +parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true") +parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true") +# argparse receiving list of classes to be ignored +parser.add_argument('-i', 
'--ignore', nargs='+', type=str, help="ignore a list of classes.") +# argparse receiving list of classes with specific IoU (e.g., python main.py --set-class-iou person 0.7) +parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.") +args = parser.parse_args() + +''' + 0,0 ------> x (width) + | + | (Left,Top) + | *_________ + | | | + | | + y |_________| + (height) * + (Right,Bottom) +''' + +# if there are no classes to ignore then replace None by empty list +if args.ignore is None: + args.ignore = [] + +specific_iou_flagged = False +if args.set_class_iou is not None: + specific_iou_flagged = True + +# make sure that the cwd() is the location of the python script (so that every path makes sense) +os.chdir(os.path.dirname(os.path.abspath(__file__))) + +GT_PATH = os.path.join(os.getcwd(), 'input', 'ground-truth') +DR_PATH = os.path.join(os.getcwd(), 'input', 'detection-results') +# if there are no images then no animation can be shown +IMG_PATH = os.path.join(os.getcwd(), 'input', 'images-optional') +if os.path.exists(IMG_PATH): + for dirpath, dirnames, files in os.walk(IMG_PATH): + if not files: + # no image files found + args.no_animation = True +else: + args.no_animation = True + +# try to import OpenCV if the user didn't choose the option --no-animation +show_animation = False +if not args.no_animation: + try: + import cv2 + show_animation = True + except ImportError: + print("\"opencv-python\" not found, please install to visualize the results.") + args.no_animation = True + +# try to import Matplotlib if the user didn't choose the option --no-plot +draw_plot = False +if not args.no_plot: + try: + import matplotlib.pyplot as plt + draw_plot = True + except ImportError: + print("\"matplotlib\" not found, please install it to get the resulting plots.") + args.no_plot = True + + +def log_average_miss_rate(precision, fp_cumsum, num_images): + """ + log-average miss rate: + Calculated by averaging miss rates at 9 evenly spaced FPPI points + between 10e-2 and 10e0, in log-space. + + output: + lamr | log-average miss rate + mr | miss rate + fppi | false positives per image + + references: + [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the + State of the Art." Pattern Analysis and Machine Intelligence, IEEE + Transactions on 34.4 (2012): 743 - 761. 
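+        (as implemented below: for each of the 9 log-spaced reference FPPI values
+         the miss rate at the largest FPPI <= that reference is looked up, and
+         lamr = exp(mean(log(max(1e-10, selected miss rates)))))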
+ """ + + # if there were no detections of that class + if precision.size == 0: + lamr = 0 + mr = 1 + fppi = 0 + return lamr, mr, fppi + + fppi = fp_cumsum / float(num_images) + mr = (1 - precision) + + fppi_tmp = np.insert(fppi, 0, -1.0) + mr_tmp = np.insert(mr, 0, 1.0) + + # Use 9 evenly spaced reference points in log-space + ref = np.logspace(-2.0, 0.0, num = 9) + for i, ref_i in enumerate(ref): + # np.where() will always find at least 1 index, since min(ref) = 0.01 and min(fppi_tmp) = -1.0 + j = np.where(fppi_tmp <= ref_i)[-1][-1] + ref[i] = mr_tmp[j] + + # log(0) is undefined, so we use the np.maximum(1e-10, ref) + lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref)))) + + return lamr, mr, fppi + +""" + throw error and exit +""" +def error(msg): + print(msg) + sys.exit(0) + +""" + check if the number is a float between 0.0 and 1.0 +""" +def is_float_between_0_and_1(value): + try: + val = float(value) + if val > 0.0 and val < 1.0: + return True + else: + return False + except ValueError: + return False + +""" + Calculate the AP given the recall and precision array + 1st) We compute a version of the measured precision/recall curve with + precision monotonically decreasing + 2nd) We compute the AP as the area under this curve by numerical integration. +""" +def voc_ap(rec, prec): + """ + --- Official matlab code VOC2012--- + mrec=[0 ; rec ; 1]; + mpre=[0 ; prec ; 0]; + for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + end + i=find(mrec(2:end)~=mrec(1:end-1))+1; + ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + rec.insert(0, 0.0) # insert 0.0 at begining of list + rec.append(1.0) # insert 1.0 at end of list + mrec = rec[:] + prec.insert(0, 0.0) # insert 0.0 at begining of list + prec.append(0.0) # insert 0.0 at end of list + mpre = prec[:] + """ + This part makes the precision monotonically decreasing + (goes from the end to the beginning) + matlab: for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + """ + # matlab indexes start in 1 but python in 0, so I have to do: + # range(start=(len(mpre) - 2), end=0, step=-1) + # also the python function range excludes the end, resulting in: + # range(start=(len(mpre) - 2), end=-1, step=-1) + for i in range(len(mpre)-2, -1, -1): + mpre[i] = max(mpre[i], mpre[i+1]) + """ + This part creates a list of indexes where the recall changes + matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1; + """ + i_list = [] + for i in range(1, len(mrec)): + if mrec[i] != mrec[i-1]: + i_list.append(i) # if it was matlab would be i + 1 + """ + The Average Precision (AP) is the area under the curve + (numerical integration) + matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + ap = 0.0 + for i in i_list: + ap += ((mrec[i]-mrec[i-1])*mpre[i]) + return ap, mrec, mpre + + +""" + Convert the lines of a file to a list +""" +def file_lines_to_list(path): + # open txt file lines to a list + with open(path) as f: + content = f.readlines() + # remove whitespace characters like `\n` at the end of each line + content = [x.strip() for x in content] + return content + +""" + Draws text in image +""" +def draw_text_in_image(img, text, pos, color, line_width): + font = cv2.FONT_HERSHEY_PLAIN + fontScale = 1 + lineType = 1 + bottomLeftCornerOfText = pos + cv2.putText(img, text, + bottomLeftCornerOfText, + font, + fontScale, + color, + lineType) + text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0] + return img, (line_width + text_width) + +""" + Plot - adjust axes +""" +def adjust_axes(r, t, fig, axes): + # get text width for re-scaling + bb = 
t.get_window_extent(renderer=r) + text_width_inches = bb.width / fig.dpi + # get axis width in inches + current_fig_width = fig.get_figwidth() + new_fig_width = current_fig_width + text_width_inches + propotion = new_fig_width / current_fig_width + # get axis limit + x_lim = axes.get_xlim() + axes.set_xlim([x_lim[0], x_lim[1]*propotion]) + +""" + Draw plot using Matplotlib +""" +def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar): + # sort the dictionary by decreasing value, into a list of tuples + sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1)) + # unpacking the list of tuples into two lists + sorted_keys, sorted_values = zip(*sorted_dic_by_value) + # + if true_p_bar != "": + """ + Special case to draw in: + - green -> TP: True Positives (object detected and matches ground-truth) + - red -> FP: False Positives (object detected but does not match ground-truth) + - orange -> FN: False Negatives (object not detected but present in the ground-truth) + """ + fp_sorted = [] + tp_sorted = [] + for key in sorted_keys: + fp_sorted.append(dictionary[key] - true_p_bar[key]) + tp_sorted.append(true_p_bar[key]) + plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive') + plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted) + # add legend + plt.legend(loc='lower right') + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + fp_val = fp_sorted[i] + tp_val = tp_sorted[i] + fp_str_val = " " + str(fp_val) + tp_str_val = fp_str_val + " " + str(tp_val) + # trick to paint multicolor with offset: + # first paint everything and then repaint the first number + t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold') + plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold') + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + else: + plt.barh(range(n_classes), sorted_values, color=plot_color) + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + str_val = " " + str(val) # add a space before + if val < 1.0: + str_val = " {0:.2f}".format(val) + t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold') + # re-set axes to show number inside the figure + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + # set window title + fig.canvas.set_window_title(window_title) + # write classes in y axis + tick_font_size = 12 + plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size) + """ + Re-scale height accordingly + """ + init_height = fig.get_figheight() + # comput the matrix height in points and inches + dpi = fig.dpi + height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing) + height_in = height_pt / dpi + # compute the required figure height + top_margin = 0.15 # in percentage of the figure height + bottom_margin = 0.05 # in percentage of the figure height + figure_height = height_in / (1 - top_margin - bottom_margin) + # set new height + if figure_height > init_height: + fig.set_figheight(figure_height) + + # set plot title + plt.title(plot_title, fontsize=14) + # set axis titles + # plt.xlabel('classes') + plt.xlabel(x_label, 
fontsize='large') + # adjust size of window + fig.tight_layout() + # save the plot + fig.savefig(output_path) + # show image + if to_show: + plt.show() + # close the plot + plt.close() + +""" + Create a ".temp_files/" and "results/" directory +""" +TEMP_FILES_PATH = ".temp_files" +if not os.path.exists(TEMP_FILES_PATH): # if it doesn't exist already + os.makedirs(TEMP_FILES_PATH) +results_files_path = "results" +if os.path.exists(results_files_path): # if it exist already + # reset the results directory + shutil.rmtree(results_files_path) + +os.makedirs(results_files_path) +if draw_plot: + os.makedirs(os.path.join(results_files_path, "classes")) +if show_animation: + os.makedirs(os.path.join(results_files_path, "images", "detections_one_by_one")) + +""" + ground-truth + Load each of the ground-truth files into a temporary ".json" file. + Create a list of all the class names present in the ground-truth (gt_classes). +""" +# get a list with the ground-truth files +ground_truth_files_list = glob.glob(GT_PATH + '/*.txt') +if len(ground_truth_files_list) == 0: + error("Error: No ground-truth files found!") +ground_truth_files_list.sort() +# dictionary with counter per class +gt_counter_per_class = {} +counter_images_per_class = {} + +for txt_file in ground_truth_files_list: + #print(txt_file) + file_id = txt_file.split(".txt", 1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + # check if there is a correspondent detection-results file + temp_path = os.path.join(DR_PATH, (file_id + ".txt")) + if not os.path.exists(temp_path): + error_msg = "Error. File not found: {}\n".format(temp_path) + error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" + error(error_msg) + lines_list = file_lines_to_list(txt_file) + # create ground-truth dictionary + bounding_boxes = [] + is_difficult = False + already_seen_classes = [] + for line in lines_list: + try: + if "difficult" in line: + class_name, left, top, right, bottom, _difficult = line.split() + is_difficult = True + else: + class_name, left, top, right, bottom = line.split() + except ValueError: + error_msg = "Error: File " + txt_file + " in the wrong format.\n" + error_msg += " Expected: ['difficult']\n" + error_msg += " Received: " + line + error_msg += "\n\nIf you have a with spaces between words you should remove them\n" + error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder." 
+ error(error_msg) + # check if class is in the ignore list, if yes skip + if class_name in args.ignore: + continue + bbox = left + " " + top + " " + right + " " +bottom + if is_difficult: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True}) + is_difficult = False + else: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False}) + # count that object + if class_name in gt_counter_per_class: + gt_counter_per_class[class_name] += 1 + else: + # if class didn't exist yet + gt_counter_per_class[class_name] = 1 + + if class_name not in already_seen_classes: + if class_name in counter_images_per_class: + counter_images_per_class[class_name] += 1 + else: + # if class didn't exist yet + counter_images_per_class[class_name] = 1 + already_seen_classes.append(class_name) + + + # dump bounding_boxes into a ".json" file + with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + +gt_classes = list(gt_counter_per_class.keys()) +# let's sort the classes alphabetically +gt_classes = sorted(gt_classes) +n_classes = len(gt_classes) +#print(gt_classes) +#print(gt_counter_per_class) + +""" + Check format of the flag --set-class-iou (if used) + e.g. check if class exists +""" +if specific_iou_flagged: + n_args = len(args.set_class_iou) + error_msg = \ + '\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]' + if n_args % 2 != 0: + error('Error, missing arguments. Flag usage:' + error_msg) + # [class_1] [IoU_1] [class_2] [IoU_2] + # specific_iou_classes = ['class_1', 'class_2'] + specific_iou_classes = args.set_class_iou[::2] # even + # iou_list = ['IoU_1', 'IoU_2'] + iou_list = args.set_class_iou[1::2] # odd + if len(specific_iou_classes) != len(iou_list): + error('Error, missing arguments. Flag usage:' + error_msg) + for tmp_class in specific_iou_classes: + if tmp_class not in gt_classes: + error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg) + for num in iou_list: + if not is_float_between_0_and_1(num): + error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg) + +""" + detection-results + Load each of the detection-results files into a temporary ".json" file. +""" +# get a list with the detection-results files +dr_files_list = glob.glob(DR_PATH + '/*.txt') +dr_files_list.sort() + +for class_index, class_name in enumerate(gt_classes): + bounding_boxes = [] + for txt_file in dr_files_list: + #print(txt_file) + # the first time it checks if all the corresponding ground-truth files exist + file_id = txt_file.split(".txt",1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(GT_PATH, (file_id + ".txt")) + if class_index == 0: + if not os.path.exists(temp_path): + error_msg = "Error. 
File not found: {}\n".format(temp_path) + error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" + error(error_msg) + lines = file_lines_to_list(txt_file) + for line in lines: + try: + tmp_class_name, confidence, left, top, right, bottom = line.split() + except ValueError: + error_msg = "Error: File " + txt_file + " in the wrong format.\n" + error_msg += " Expected: \n" + error_msg += " Received: " + line + error(error_msg) + if tmp_class_name == class_name: + #print("match") + bbox = left + " " + top + " " + right + " " +bottom + bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox}) + #print(bounding_boxes) + # sort detection-results by decreasing confidence + bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True) + with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + +""" + Calculate the AP for each class +""" +sum_AP = 0.0 +ap_dictionary = {} +lamr_dictionary = {} +# open file to store the results +with open(results_files_path + "/results.txt", 'w') as results_file: + results_file.write("# AP and precision/recall per class\n") + count_true_positives = {} + for class_index, class_name in enumerate(gt_classes): + count_true_positives[class_name] = 0 + """ + Load detection-results of that class + """ + dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json" + dr_data = json.load(open(dr_file)) + + """ + Assign detection-results to ground-truth objects + """ + nd = len(dr_data) + tp = [0] * nd # creates an array of zeros of size nd + fp = [0] * nd + for idx, detection in enumerate(dr_data): + file_id = detection["file_id"] + if show_animation: + # find ground truth image + ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*") + #tifCounter = len(glob.glob1(myPath,"*.tif")) + if len(ground_truth_img) == 0: + error("Error. Image not found with id: " + file_id) + elif len(ground_truth_img) > 1: + error("Error. 
Multiple image with id: " + file_id) + else: # found image + #print(IMG_PATH + "/" + ground_truth_img[0]) + # Load image + img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0]) + # load image with draws of multiple detections + img_cumulative_path = results_files_path + "/images/" + ground_truth_img[0] + if os.path.isfile(img_cumulative_path): + img_cumulative = cv2.imread(img_cumulative_path) + else: + img_cumulative = img.copy() + # Add bottom border to image + bottom_border = 60 + BLACK = [0, 0, 0] + img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK) + # assign detection-results to ground truth object if any + # open ground-truth with that file_id + gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" + ground_truth_data = json.load(open(gt_file)) + ovmax = -1 + gt_match = -1 + # load detected object bounding-box + bb = [ float(x) for x in detection["bbox"].split() ] + for obj in ground_truth_data: + # look for a class_name match + if obj["class_name"] == class_name: + bbgt = [ float(x) for x in obj["bbox"].split() ] + bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] + iw = bi[2] - bi[0] + 1 + ih = bi[3] - bi[1] + 1 + if iw > 0 and ih > 0: + # compute overlap (IoU) = area of intersection / area of union + ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] + + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih + ov = iw * ih / ua + if ov > ovmax: + ovmax = ov + gt_match = obj + + # assign detection as true positive/don't care/false positive + if show_animation: + status = "NO MATCH FOUND!" # status is only used in the animation + # set minimum overlap + min_overlap = MINOVERLAP + if specific_iou_flagged: + if class_name in specific_iou_classes: + index = specific_iou_classes.index(class_name) + min_overlap = float(iou_list[index]) + if ovmax >= min_overlap: + if "difficult" not in gt_match: + if not bool(gt_match["used"]): + # true positive + tp[idx] = 1 + gt_match["used"] = True + count_true_positives[class_name] += 1 + # update the ".json" file + with open(gt_file, 'w') as f: + f.write(json.dumps(ground_truth_data)) + if show_animation: + status = "MATCH!" + else: + # false positive (multiple detection) + fp[idx] = 1 + if show_animation: + status = "REPEATED MATCH!" 
+ else: + # false positive + fp[idx] = 1 + if ovmax > 0: + status = "INSUFFICIENT OVERLAP" + + """ + Draw image to show animation + """ + if show_animation: + height, widht = img.shape[:2] + # colors (OpenCV works with BGR) + white = (255,255,255) + light_blue = (255,200,100) + green = (0,255,0) + light_red = (30,30,255) + # 1st line + margin = 10 + v_pos = int(height - margin - (bottom_border / 2.0)) + text = "Image: " + ground_truth_img[0] + " " + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width) + if ovmax != -1: + color = light_red + if status == "INSUFFICIENT OVERLAP": + text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100) + else: + text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100) + color = green + img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + # 2nd line + v_pos += int(bottom_border / 2.0) + rank_pos = str(idx+1) # rank position (idx starts at 0) + text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100) + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + color = light_red + if status == "MATCH!": + color = green + text = "Result: " + status + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + + font = cv2.FONT_HERSHEY_SIMPLEX + if ovmax > 0: # if there is intersections between the bounding-boxes + bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ] + cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA) + bb = [int(i) for i in bb] + cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA) + # show image + cv2.imshow("Animation", img) + cv2.waitKey(20) # show for 20 ms + # save image to results + output_img_path = results_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg" + cv2.imwrite(output_img_path, img) + # save the image with all the objects drawn to it + cv2.imwrite(img_cumulative_path, img_cumulative) + + #print(tp) + # compute precision/recall + cumsum = 0 + for idx, val in enumerate(fp): + fp[idx] += cumsum + cumsum += val + cumsum = 0 + for idx, val in enumerate(tp): + tp[idx] += cumsum + cumsum += val + #print(tp) + rec = tp[:] + for idx, val in enumerate(tp): + rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name] + #print(rec) + prec = tp[:] + for idx, val in enumerate(tp): + prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx]) + #print(prec) + + ap, mrec, mprec = voc_ap(rec[:], prec[:]) + sum_AP += ap + text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) + """ + Write to results.txt + """ + rounded_prec = [ '%.2f' % elem for elem in prec ] + rounded_rec = [ '%.2f' % elem for elem in rec ] + results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") + if not args.quiet: + print(text) + 
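+        # (worked example of the integration done by voc_ap above, assuming
+        #  rec = [0.5, 1.0] and prec = [1.0, 0.5]: the padded arrays are
+        #  mrec = [0, 0.5, 1.0, 1.0] and, after the monotonic pass,
+        #  mpre = [1.0, 1.0, 0.5, 0.0], so
+        #  ap = (0.5 - 0)*1.0 + (1.0 - 0.5)*0.5 = 0.75)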
ap_dictionary[class_name] = ap + + n_images = counter_images_per_class[class_name] + lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images) + lamr_dictionary[class_name] = lamr + + """ + Draw plot + """ + if draw_plot: + plt.plot(rec, prec, '-o') + # add a new penultimate point to the list (mrec[-2], 0.0) + # since the last line segment (and respective area) do not affect the AP value + area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]] + area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]] + plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r') + # set window title + fig = plt.gcf() # gcf - get current figure + fig.canvas.set_window_title('AP ' + class_name) + # set plot title + plt.title('class: ' + text) + #plt.suptitle('This is a somewhat long figure title', fontsize=16) + # set axis titles + plt.xlabel('Recall') + plt.ylabel('Precision') + # optional - set axes + axes = plt.gca() # gca - get current axes + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) # .05 to give some extra space + # Alternative option -> wait for button to be pressed + #while not plt.waitforbuttonpress(): pass # wait for key display + # Alternative option -> normal display + #plt.show() + # save the plot + fig.savefig(results_files_path + "/classes/" + class_name + ".png") + plt.cla() # clear axes for next plot + + if show_animation: + cv2.destroyAllWindows() + + results_file.write("\n# mAP of all classes\n") + mAP = sum_AP / n_classes + text = "mAP = {0:.2f}%".format(mAP*100) + results_file.write(text + "\n") + print(text) + +# remove the temp_files directory +shutil.rmtree(TEMP_FILES_PATH) + +""" + Count total of detection-results +""" +# iterate through all the files +det_counter_per_class = {} +for txt_file in dr_files_list: + # get lines to list + lines_list = file_lines_to_list(txt_file) + for line in lines_list: + class_name = line.split()[0] + # check if class is in the ignore list, if yes skip + if class_name in args.ignore: + continue + # count that object + if class_name in det_counter_per_class: + det_counter_per_class[class_name] += 1 + else: + # if class didn't exist yet + det_counter_per_class[class_name] = 1 +#print(det_counter_per_class) +dr_classes = list(det_counter_per_class.keys()) + + +""" + Plot the total number of occurences of each class in the ground-truth +""" +if draw_plot: + window_title = "ground-truth-info" + plot_title = "ground-truth\n" + plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)" + x_label = "Number of objects per class" + output_path = results_files_path + "/ground-truth-info.png" + to_show = False + plot_color = 'forestgreen' + draw_plot_func( + gt_counter_per_class, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + '', + ) + +""" + Write number of ground-truth objects per class to results.txt +""" +with open(results_files_path + "/results.txt", 'a') as results_file: + results_file.write("\n# Number of ground-truth objects per class\n") + for class_name in sorted(gt_counter_per_class): + results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n") + +""" + Finish counting true positives +""" +for class_name in dr_classes: + # if class exists in detection-result but not in ground-truth then there are no true positives in that class + if class_name not in gt_classes: + count_true_positives[class_name] = 0 +#print(count_true_positives) + +""" + Plot the total number of occurences of each 
class in the "detection-results" folder +""" +if draw_plot: + window_title = "detection-results-info" + # Plot title + plot_title = "detection-results\n" + plot_title += "(" + str(len(dr_files_list)) + " files and " + count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values())) + plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)" + # end Plot title + x_label = "Number of objects per class" + output_path = results_files_path + "/detection-results-info.png" + to_show = False + plot_color = 'forestgreen' + true_p_bar = count_true_positives + draw_plot_func( + det_counter_per_class, + len(det_counter_per_class), + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + true_p_bar + ) + +""" + Write number of detected objects per class to results.txt +""" +with open(results_files_path + "/results.txt", 'a') as results_file: + results_file.write("\n# Number of detected objects per class\n") + for class_name in sorted(dr_classes): + n_det = det_counter_per_class[class_name] + text = class_name + ": " + str(n_det) + text += " (tp:" + str(count_true_positives[class_name]) + "" + text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n" + results_file.write(text) + +""" + Draw log-average miss rate plot (Show lamr of all classes in decreasing order) +""" +if draw_plot: + window_title = "lamr" + plot_title = "log-average miss rate" + x_label = "log-average miss rate" + output_path = results_files_path + "/lamr.png" + to_show = False + plot_color = 'royalblue' + draw_plot_func( + lamr_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + +""" + Draw mAP plot (Show AP's of all classes in decreasing order) +""" +if draw_plot: + window_title = "mAP" + plot_title = "mAP = {0:.2f}%".format(mAP*100) + x_label = "Average Precision" + output_path = results_files_path + "/mAP.png" + to_show = True + plot_color = 'royalblue' + draw_plot_func( + ap_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) diff --git a/img/street.jpg b/img/street.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6750d3724956500e32d5bc4a918a57db7df30100 Binary files /dev/null and b/img/street.jpg differ diff --git a/logs/README.md b/logs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..984825a33fed43b06f1d8cd1f38bb73699e2ff47 --- /dev/null +++ b/logs/README.md @@ -0,0 +1 @@ +用于存放训练好的文件 \ No newline at end of file diff --git a/model_data/coco_classes.txt b/model_data/coco_classes.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ec6eeea2f217dca9788d8a4f9ab032dd05e6beb --- /dev/null +++ b/model_data/coco_classes.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git 
a/model_data/simhei.ttf b/model_data/simhei.ttf new file mode 100644 index 0000000000000000000000000000000000000000..5bd4687e7212775e23bea569f08fdd1cd7395dc3 Binary files /dev/null and b/model_data/simhei.ttf differ diff --git a/model_data/voc_classes.txt b/model_data/voc_classes.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c1b622c19478d847285c7fcde40bdbc6355b90 --- /dev/null +++ b/model_data/voc_classes.txt @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor \ No newline at end of file diff --git a/model_data/yolo_anchors.txt b/model_data/yolo_anchors.txt new file mode 100644 index 0000000000000000000000000000000000000000..396f07e146622a8450bd3b7719d43390203eb089 --- /dev/null +++ b/model_data/yolo_anchors.txt @@ -0,0 +1 @@ +12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 \ No newline at end of file diff --git a/nets/CSPdarknet.py b/nets/CSPdarknet.py new file mode 100644 index 0000000000000000000000000000000000000000..584690f1362c60d0605ad583a758145a2ae80c6c --- /dev/null +++ b/nets/CSPdarknet.py @@ -0,0 +1,140 @@ +import torch +import torch.nn.functional as F +import torch.nn as nn +import math +from collections import OrderedDict + +#-------------------------------------------------# +# MISH激活函数 +#-------------------------------------------------# +class Mish(nn.Module): + def __init__(self): + super(Mish, self).__init__() + + def forward(self, x): + return x * torch.tanh(F.softplus(x)) + +#-------------------------------------------------# +# 卷积块 +# CONV+BATCHNORM+MISH +#-------------------------------------------------# +class BasicConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1): + super(BasicConv, self).__init__() + + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, kernel_size//2, bias=False) + self.bn = nn.BatchNorm2d(out_channels) + self.activation = Mish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.activation(x) + return x + +#---------------------------------------------------# +# CSPdarknet的结构块的组成部分 +# 内部堆叠的残差块 +#---------------------------------------------------# +class Resblock(nn.Module): + def __init__(self, channels, hidden_channels=None, residual_activation=nn.Identity()): + super(Resblock, self).__init__() + + if hidden_channels is None: + hidden_channels = channels + + self.block = nn.Sequential( + BasicConv(channels, hidden_channels, 1), + BasicConv(hidden_channels, channels, 3) + ) + + def forward(self, x): + return x + self.block(x) + +#---------------------------------------------------# +# CSPdarknet的结构块 +# 存在一个大残差边 +# 这个大残差边绕过了很多的残差结构 +#---------------------------------------------------# +class Resblock_body(nn.Module): + def __init__(self, in_channels, out_channels, num_blocks, first): + super(Resblock_body, self).__init__() + + self.downsample_conv = BasicConv(in_channels, out_channels, 3, stride=2) + + if first: + self.split_conv0 = BasicConv(out_channels, out_channels, 1) + self.split_conv1 = BasicConv(out_channels, out_channels, 1) + self.blocks_conv = nn.Sequential( + Resblock(channels=out_channels, hidden_channels=out_channels//2), + BasicConv(out_channels, out_channels, 1) + ) + self.concat_conv = BasicConv(out_channels*2, out_channels, 1) + else: + self.split_conv0 = BasicConv(out_channels, out_channels//2, 1) + self.split_conv1 = BasicConv(out_channels, out_channels//2, 1) + + 
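+            # (CSP split: split_conv0 feeds the large shortcut branch that bypasses
+            #  the stacked residual blocks, split_conv1 feeds the branch that runs
+            #  through them; the two halves are concatenated again in forward())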
self.blocks_conv = nn.Sequential( + *[Resblock(out_channels//2) for _ in range(num_blocks)], + BasicConv(out_channels//2, out_channels//2, 1) + ) + self.concat_conv = BasicConv(out_channels, out_channels, 1) + + def forward(self, x): + x = self.downsample_conv(x) + + x0 = self.split_conv0(x) + + x1 = self.split_conv1(x) + x1 = self.blocks_conv(x1) + + x = torch.cat([x1, x0], dim=1) + x = self.concat_conv(x) + + return x + +class CSPDarkNet(nn.Module): + def __init__(self, layers): + super(CSPDarkNet, self).__init__() + self.inplanes = 32 + self.conv1 = BasicConv(3, self.inplanes, kernel_size=3, stride=1) + self.feature_channels = [64, 128, 256, 512, 1024] + + self.stages = nn.ModuleList([ + Resblock_body(self.inplanes, self.feature_channels[0], layers[0], first=True), + Resblock_body(self.feature_channels[0], self.feature_channels[1], layers[1], first=False), + Resblock_body(self.feature_channels[1], self.feature_channels[2], layers[2], first=False), + Resblock_body(self.feature_channels[2], self.feature_channels[3], layers[3], first=False), + Resblock_body(self.feature_channels[3], self.feature_channels[4], layers[4], first=False) + ]) + + self.num_features = 1 + # 进行权值初始化 + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + + def forward(self, x): + x = self.conv1(x) + + x = self.stages[0](x) + x = self.stages[1](x) + out3 = self.stages[2](x) + out4 = self.stages[3](out3) + out5 = self.stages[4](out4) + + return out3, out4, out5 + +def darknet53(pretrained, **kwargs): + model = CSPDarkNet([1, 2, 8, 8, 4]) + if pretrained: + if isinstance(pretrained, str): + model.load_state_dict(torch.load(pretrained)) + else: + raise Exception("darknet request a pretrained path. 
got [{}]".format(pretrained)) + return model diff --git a/nets/yolo4.py b/nets/yolo4.py new file mode 100644 index 0000000000000000000000000000000000000000..2c8f62cee7c2fcd3ca8331b0186acfcd36f5aa76 --- /dev/null +++ b/nets/yolo4.py @@ -0,0 +1,150 @@ +import torch +import torch.nn as nn +from collections import OrderedDict +from nets.CSPdarknet import darknet53 + +def conv2d(filter_in, filter_out, kernel_size, stride=1): + pad = (kernel_size - 1) // 2 if kernel_size else 0 + return nn.Sequential(OrderedDict([ + ("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=pad, bias=False)), + ("bn", nn.BatchNorm2d(filter_out)), + ("relu", nn.LeakyReLU(0.1)), + ])) + +#---------------------------------------------------# +# SPP结构,利用不同大小的池化核进行池化 +# 池化后堆叠 +#---------------------------------------------------# +class SpatialPyramidPooling(nn.Module): + def __init__(self, pool_sizes=[5, 9, 13]): + super(SpatialPyramidPooling, self).__init__() + + self.maxpools = nn.ModuleList([nn.MaxPool2d(pool_size, 1, pool_size//2) for pool_size in pool_sizes]) + + def forward(self, x): + features = [maxpool(x) for maxpool in self.maxpools[::-1]] + features = torch.cat(features + [x], dim=1) + + return features + +#---------------------------------------------------# +# 卷积 + 上采样 +#---------------------------------------------------# +class Upsample(nn.Module): + def __init__(self, in_channels, out_channels): + super(Upsample, self).__init__() + + self.upsample = nn.Sequential( + conv2d(in_channels, out_channels, 1), + nn.Upsample(scale_factor=2, mode='nearest') + ) + + def forward(self, x,): + x = self.upsample(x) + return x + +#---------------------------------------------------# +# 三次卷积块 +#---------------------------------------------------# +def make_three_conv(filters_list, in_filters): + m = nn.Sequential( + conv2d(in_filters, filters_list[0], 1), + conv2d(filters_list[0], filters_list[1], 3), + conv2d(filters_list[1], filters_list[0], 1), + ) + return m + +#---------------------------------------------------# +# 五次卷积块 +#---------------------------------------------------# +def make_five_conv(filters_list, in_filters): + m = nn.Sequential( + conv2d(in_filters, filters_list[0], 1), + conv2d(filters_list[0], filters_list[1], 3), + conv2d(filters_list[1], filters_list[0], 1), + conv2d(filters_list[0], filters_list[1], 3), + conv2d(filters_list[1], filters_list[0], 1), + ) + return m + +#---------------------------------------------------# +# 最后获得yolov4的输出 +#---------------------------------------------------# +def yolo_head(filters_list, in_filters): + m = nn.Sequential( + conv2d(in_filters, filters_list[0], 3), + nn.Conv2d(filters_list[0], filters_list[1], 1), + ) + return m + +#---------------------------------------------------# +# yolo_body +#---------------------------------------------------# +class YoloBody(nn.Module): + def __init__(self, num_anchors, num_classes): + super(YoloBody, self).__init__() + # backbone + self.backbone = darknet53(None) + + self.conv1 = make_three_conv([512,1024],1024) + self.SPP = SpatialPyramidPooling() + self.conv2 = make_three_conv([512,1024],2048) + + self.upsample1 = Upsample(512,256) + self.conv_for_P4 = conv2d(512,256,1) + self.make_five_conv1 = make_five_conv([256, 512],512) + + self.upsample2 = Upsample(256,128) + self.conv_for_P3 = conv2d(256,128,1) + self.make_five_conv2 = make_five_conv([128, 256],256) + # 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75 + # 4+1+num_classes + final_out_filter2 = num_anchors * (5 + num_classes) + 
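+        # (yolo_head3 sits on the highest-resolution PANet branch -- 52x52 for a
+        #  416x416 input -- which is typically the head responsible for the
+        #  smallest objects)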
self.yolo_head3 = yolo_head([256, final_out_filter2],128) + + self.down_sample1 = conv2d(128,256,3,stride=2) + self.make_five_conv3 = make_five_conv([256, 512],512) + # 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75 + final_out_filter1 = num_anchors * (5 + num_classes) + self.yolo_head2 = yolo_head([512, final_out_filter1],256) + + + self.down_sample2 = conv2d(256,512,3,stride=2) + self.make_five_conv4 = make_five_conv([512, 1024],1024) + # 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75 + final_out_filter0 = num_anchors * (5 + num_classes) + self.yolo_head1 = yolo_head([1024, final_out_filter0],512) + + + def forward(self, x): + # backbone + x2, x1, x0 = self.backbone(x) + + P5 = self.conv1(x0) + P5 = self.SPP(P5) + P5 = self.conv2(P5) + + P5_upsample = self.upsample1(P5) + P4 = self.conv_for_P4(x1) + P4 = torch.cat([P4,P5_upsample],axis=1) + P4 = self.make_five_conv1(P4) + + P4_upsample = self.upsample2(P4) + P3 = self.conv_for_P3(x2) + P3 = torch.cat([P3,P4_upsample],axis=1) + P3 = self.make_five_conv2(P3) + + P3_downsample = self.down_sample1(P3) + P4 = torch.cat([P3_downsample,P4],axis=1) + P4 = self.make_five_conv3(P4) + + P4_downsample = self.down_sample2(P4) + P5 = torch.cat([P4_downsample,P5],axis=1) + P5 = self.make_five_conv4(P5) + + out2 = self.yolo_head3(P3) + out1 = self.yolo_head2(P4) + out0 = self.yolo_head1(P5) + + return out0, out1, out2 + diff --git a/nets/yolo_training.py b/nets/yolo_training.py new file mode 100644 index 0000000000000000000000000000000000000000..04678d7247782c35d63cad021407daeeccd3eda4 --- /dev/null +++ b/nets/yolo_training.py @@ -0,0 +1,507 @@ + +from random import shuffle +import numpy as np +import torch +import torch.nn as nn +import math +import torch.nn.functional as F +from matplotlib.colors import rgb_to_hsv, hsv_to_rgb +from PIL import Image +from utils.utils import bbox_iou, merge_bboxes + +#---------------------------------------------------# +# 平滑标签 +#---------------------------------------------------# +def smooth_labels(y_true, label_smoothing,num_classes): + return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes + +def box_ciou(b1, b2): + """ + 输入为: + ---------- + b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh + b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh + + 返回为: + ------- + ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1) + """ + # 求出预测框左上角右下角 + b1_xy = b1[..., :2] + b1_wh = b1[..., 2:4] + b1_wh_half = b1_wh/2. + b1_mins = b1_xy - b1_wh_half + b1_maxes = b1_xy + b1_wh_half + # 求出真实框左上角右下角 + b2_xy = b2[..., :2] + b2_wh = b2[..., 2:4] + b2_wh_half = b2_wh/2. 
+ b2_mins = b2_xy - b2_wh_half + b2_maxes = b2_xy + b2_wh_half + + # 求真实框和预测框所有的iou + intersect_mins = torch.max(b1_mins, b2_mins) + intersect_maxes = torch.min(b1_maxes, b2_maxes) + intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes)) + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + b1_area = b1_wh[..., 0] * b1_wh[..., 1] + b2_area = b2_wh[..., 0] * b2_wh[..., 1] + union_area = b1_area + b2_area - intersect_area + iou = intersect_area / (union_area + 1e-6) + + # 计算中心的差距 + center_distance = torch.sum(torch.pow((b1_xy - b2_xy), 2), axis=-1) + + # 找到包裹两个框的最小框的左上角和右下角 + enclose_mins = torch.min(b1_mins, b2_mins) + enclose_maxes = torch.max(b1_maxes, b2_maxes) + enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes)) + # 计算对角线距离 + enclose_diagonal = torch.sum(torch.pow(enclose_wh,2), axis=-1) + ciou = iou - 1.0 * (center_distance) / (enclose_diagonal + 1e-7) + + v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0]/b1_wh[..., 1]) - torch.atan(b2_wh[..., 0]/b2_wh[..., 1])), 2) + alpha = v / (1.0 - iou + v) + ciou = ciou - alpha * v + return ciou + +def clip_by_tensor(t,t_min,t_max): + t=t.float() + result = (t >= t_min).float() * t + (t < t_min).float() * t_min + result = (result <= t_max).float() * result + (result > t_max).float() * t_max + return result + +def MSELoss(pred,target): + return (pred-target)**2 + +def BCELoss(pred,target): + epsilon = 1e-7 + pred = clip_by_tensor(pred, epsilon, 1.0 - epsilon) + output = -target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred) + return output + +class YOLOLoss(nn.Module): + def __init__(self, anchors, num_classes, img_size, label_smooth=0, cuda=True): + super(YOLOLoss, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.bbox_attrs = 5 + num_classes + self.img_size = img_size + self.feature_length = [img_size[0]//32,img_size[0]//16,img_size[0]//8] + self.label_smooth = label_smooth + + self.ignore_threshold = 0.5 + self.lambda_conf = 1.0 + self.lambda_cls = 1.0 + self.lambda_loc = 1.0 + self.cuda = cuda + + def forward(self, input, targets=None): + # input为bs,3*(5+num_classes),13,13 + + # 一共多少张图片 + bs = input.size(0) + # 特征层的高 + in_h = input.size(2) + # 特征层的宽 + in_w = input.size(3) + + # 计算步长 + # 每一个特征点对应原来的图片上多少个像素点 + # 如果特征层为13x13的话,一个特征点就对应原来的图片上的32个像素点 + stride_h = self.img_size[1] / in_h + stride_w = self.img_size[0] / in_w + + # 把先验框的尺寸调整成特征层大小的形式 + # 计算出先验框在特征层上对应的宽高 + scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors] + # bs,3*(5+num_classes),13,13 -> bs,3,13,13,(5+num_classes) + prediction = input.view(bs, int(self.num_anchors/3), + self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous() + + # 对prediction预测进行调整 + conf = torch.sigmoid(prediction[..., 4]) # Conf + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
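+        # (a sketch of the target tensors built below, assuming a 13x13 feature map
+        #  and num_anchors/3 = 3 priors per scale: mask, noobj_mask and tconf are
+        #  (bs, 3, 13, 13), t_box is (bs, 3, 13, 13, 4) and
+        #  tcls is (bs, 3, 13, 13, num_classes))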
+ + # 找到哪些先验框内部包含物体 + mask, noobj_mask, t_box, tconf, tcls, box_loss_scale_x, box_loss_scale_y = self.get_target(targets, scaled_anchors,in_w, in_h,self.ignore_threshold) + + noobj_mask, pred_boxes_for_ciou = self.get_ignore(prediction, targets, scaled_anchors, in_w, in_h, noobj_mask) + + if self.cuda: + mask, noobj_mask = mask.cuda(), noobj_mask.cuda() + box_loss_scale_x, box_loss_scale_y= box_loss_scale_x.cuda(), box_loss_scale_y.cuda() + tconf, tcls = tconf.cuda(), tcls.cuda() + pred_boxes_for_ciou = pred_boxes_for_ciou.cuda() + t_box = t_box.cuda() + + box_loss_scale = 2-box_loss_scale_x*box_loss_scale_y + # losses. + ciou = (1 - box_ciou( pred_boxes_for_ciou[mask.bool()], t_box[mask.bool()]))* box_loss_scale[mask.bool()] + + loss_loc = torch.sum(ciou / bs) + loss_conf = torch.sum(BCELoss(conf, mask) * mask / bs) + \ + torch.sum(BCELoss(conf, mask) * noobj_mask / bs) + + # print(smooth_labels(tcls[mask == 1],self.label_smooth,self.num_classes)) + loss_cls = torch.sum(BCELoss(pred_cls[mask == 1], smooth_labels(tcls[mask == 1],self.label_smooth,self.num_classes))/bs) + # print(loss_loc,loss_conf,loss_cls) + loss = loss_conf * self.lambda_conf + loss_cls * self.lambda_cls + loss_loc * self.lambda_loc + return loss, loss_conf.item(), loss_cls.item(), loss_loc.item() + + def get_target(self, target, anchors, in_w, in_h, ignore_threshold): + # 计算一共有多少张图片 + bs = len(target) + # 获得先验框 + anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)] + subtract_index = [0,3,6][self.feature_length.index(in_w)] + # 创建全是0或者全是1的阵列 + mask = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + noobj_mask = torch.ones(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + + tx = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + ty = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + tw = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + th = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + t_box = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, 4, requires_grad=False) + tconf = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + tcls = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, self.num_classes, requires_grad=False) + + box_loss_scale_x = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + box_loss_scale_y = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) + for b in range(bs): + for t in range(target[b].shape[0]): + # 计算出在特征层上的点位 + gx = target[b][t, 0] * in_w + gy = target[b][t, 1] * in_h + + gw = target[b][t, 2] * in_w + gh = target[b][t, 3] * in_h + + # 计算出属于哪个网格 + gi = int(gx) + gj = int(gy) + + # 计算真实框的位置 + gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0) + + # 计算出所有先验框的位置 + anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((self.num_anchors, 2)), + np.array(anchors)), 1)) + # 计算重合程度 + anch_ious = bbox_iou(gt_box, anchor_shapes) + + # Find the best matching anchor box + best_n = np.argmax(anch_ious) + if best_n not in anchor_index: + continue + # Masks + if (gj < in_h) and (gi < in_w): + best_n = best_n - subtract_index + # 判定哪些先验框内部真实的存在物体 + noobj_mask[b, best_n, gj, gi] = 0 + mask[b, best_n, gj, gi] = 1 + # 计算先验框中心调整参数 + tx[b, best_n, gj, gi] = gx + ty[b, best_n, gj, gi] = gy + # 计算先验框宽高调整参数 + tw[b, best_n, gj, gi] = gw + th[b, best_n, gj, gi] = gh + # 用于获得xywh的比例 + box_loss_scale_x[b, best_n, gj, gi] = target[b][t, 2] + 
box_loss_scale_y[b, best_n, gj, gi] = target[b][t, 3] + # 物体置信度 + tconf[b, best_n, gj, gi] = 1 + # 种类 + tcls[b, best_n, gj, gi, int(target[b][t, 4])] = 1 + else: + print('Step {0} out of bound'.format(b)) + print('gj: {0}, height: {1} | gi: {2}, width: {3}'.format(gj, in_h, gi, in_w)) + continue + t_box[...,0] = tx + t_box[...,1] = ty + t_box[...,2] = tw + t_box[...,3] = th + return mask, noobj_mask, t_box, tconf, tcls, box_loss_scale_x, box_loss_scale_y + + def get_ignore(self,prediction,target,scaled_anchors,in_w, in_h,noobj_mask): + bs = len(target) + anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)] + scaled_anchors = np.array(scaled_anchors)[anchor_index] + # 先验框的中心位置的调整参数 + x = torch.sigmoid(prediction[..., 0]) + y = torch.sigmoid(prediction[..., 1]) + # 先验框的宽高调整参数 + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + + # 生成网格,先验框中心,网格左上角 + grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_w, 1).repeat( + int(bs*self.num_anchors/3), 1, 1).view(x.shape).type(FloatTensor) + grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_h, 1).t().repeat( + int(bs*self.num_anchors/3), 1, 1).view(y.shape).type(FloatTensor) + + # 生成先验框的宽高 + anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) + anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) + + anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape) + anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape) + + # 计算调整后的先验框中心与宽高 + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x + grid_x + pred_boxes[..., 1] = y + grid_y + pred_boxes[..., 2] = torch.exp(w) * anchor_w + pred_boxes[..., 3] = torch.exp(h) * anchor_h + for i in range(bs): + pred_boxes_for_ignore = pred_boxes[i] + pred_boxes_for_ignore = pred_boxes_for_ignore.view(-1, 4) + + for t in range(target[i].shape[0]): + gx = target[i][t, 0] * in_w + gy = target[i][t, 1] * in_h + gw = target[i][t, 2] * in_w + gh = target[i][t, 3] * in_h + gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0).type(FloatTensor) + + anch_ious = bbox_iou(gt_box, pred_boxes_for_ignore, x1y1x2y2=False) + anch_ious = anch_ious.view(pred_boxes[i].size()[:3]) + noobj_mask[i][anch_ious>self.ignore_threshold] = 0 + return noobj_mask, pred_boxes + + +def rand(a=0, b=1): + return np.random.rand()*(b-a) + a + + +class Generator(object): + def __init__(self,batch_size, + train_lines, image_size, + ): + + self.batch_size = batch_size + self.train_lines = train_lines + self.train_batches = len(train_lines) + self.image_size = image_size + + def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5): + '''r实时数据增强的随机预处理''' + line = annotation_line.split() + image = Image.open(line[0]) + iw, ih = image.size + h, w = input_shape + box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]]) + + # resize image + new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter) + scale = rand(.25, 2) + if new_ar < 1: + nh = int(scale*h) + nw = int(nh*new_ar) + else: + nw = int(scale*w) + nh = int(nw/new_ar) + image = image.resize((nw,nh), Image.BICUBIC) + + # place image + dx = int(rand(0, w-nw)) + dy = int(rand(0, h-nh)) + new_image = Image.new('RGB', (w,h), (128,128,128)) + new_image.paste(image, (dx, dy)) + image = new_image + + # flip image or not + flip = rand()<.5 + if flip: 
image = image.transpose(Image.FLIP_LEFT_RIGHT)
+
+        # distort image
+        hue = rand(-hue, hue)
+        sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
+        val = rand(1, val) if rand()<.5 else 1/rand(1, val)
+        x = rgb_to_hsv(np.array(image)/255.)
+        x[..., 0] += hue
+        x[..., 0][x[..., 0]>1] -= 1
+        x[..., 0][x[..., 0]<0] += 1
+        x[..., 1] *= sat
+        x[..., 2] *= val
+        x[x>1] = 1
+        x[x<0] = 0
+        image_data = hsv_to_rgb(x)*255 # numpy array, 0 to 255
+
+        # correct boxes
+        box_data = np.zeros((len(box),5))
+        if len(box)>0:
+            np.random.shuffle(box)
+            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+            if flip: box[:, [0,2]] = w - box[:, [2,0]]
+            box[:, 0:2][box[:, 0:2]<0] = 0
+            box[:, 2][box[:, 2]>w] = w
+            box[:, 3][box[:, 3]>h] = h
+            box_w = box[:, 2] - box[:, 0]
+            box_h = box[:, 3] - box[:, 1]
+            box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
+            box_data = np.zeros((len(box),5))
+            box_data[:len(box)] = box
+        if len(box) == 0:
+            return image_data, []
+
+        if (box_data[:,:4]>0).any():
+            return image_data, box_data
+        else:
+            return image_data, []
+
+    def get_random_data_with_Mosaic(self, annotation_line, input_shape, hue=.1, sat=1.5, val=1.5):
+        '''random preprocessing for real-time data augmentation'''
+        h, w = input_shape
+        min_offset_x = 0.4
+        min_offset_y = 0.4
+        scale_low = 1-min(min_offset_x,min_offset_y)
+        scale_high = scale_low+0.2
+
+        image_datas = []
+        box_datas = []
+        index = 0
+
+        place_x = [0,0,int(w*min_offset_x),int(w*min_offset_x)]
+        place_y = [0,int(h*min_offset_y),int(h*min_offset_y),0]
+        for line in annotation_line:
+            # 每一行进行分割
+            line_content = line.split()
+            # 打开图片
+            image = Image.open(line_content[0])
+            image = image.convert("RGB")
+            # 图片的大小
+            iw, ih = image.size
+            # 保存框的位置
+            box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
+
+            # 是否翻转图片
+            flip = rand()<.5
+            if flip and len(box)>0:
+                image = image.transpose(Image.FLIP_LEFT_RIGHT)
+                box[:, [0,2]] = iw - box[:, [2,0]]
+
+            # 对输入进来的图片进行缩放
+            new_ar = w/h
+            scale = rand(scale_low, scale_high)
+            if new_ar < 1:
+                nh = int(scale*h)
+                nw = int(nh*new_ar)
+            else:
+                nw = int(scale*w)
+                nh = int(nw/new_ar)
+            image = image.resize((nw,nh), Image.BICUBIC)
+
+            # 进行色域变换
+            hue = rand(-hue, hue)
+            sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
+            val = rand(1, val) if rand()<.5 else 1/rand(1, val)
+            x = rgb_to_hsv(np.array(image)/255.)
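+            # x is the resized crop converted to HSV, with all channels in [0, 1];
+            # the hue channel is shifted with wrap-around and the saturation/value
+            # channels are rescaled before converting back to RGB below.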
+ x[..., 0] += hue + x[..., 0][x[..., 0]>1] -= 1 + x[..., 0][x[..., 0]<0] += 1 + x[..., 1] *= sat + x[..., 2] *= val + x[x>1] = 1 + x[x<0] = 0 + image = hsv_to_rgb(x) + + image = Image.fromarray((image*255).astype(np.uint8)) + # 将图片进行放置,分别对应四张分割图片的位置 + dx = place_x[index] + dy = place_y[index] + new_image = Image.new('RGB', (w,h), (128,128,128)) + new_image.paste(image, (dx, dy)) + image_data = np.array(new_image)/255 + + + index = index + 1 + box_data = [] + # 对box进行重新处理 + if len(box)>0: + np.random.shuffle(box) + box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx + box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy + box[:, 0:2][box[:, 0:2]<0] = 0 + box[:, 2][box[:, 2]>w] = w + box[:, 3][box[:, 3]>h] = h + box_w = box[:, 2] - box[:, 0] + box_h = box[:, 3] - box[:, 1] + box = box[np.logical_and(box_w>1, box_h>1)] + box_data = np.zeros((len(box),5)) + box_data[:len(box)] = box + + image_datas.append(image_data) + box_datas.append(box_data) + + # 将图片分割,放在一起 + cutx = np.random.randint(int(w*min_offset_x), int(w*(1 - min_offset_x))) + cuty = np.random.randint(int(h*min_offset_y), int(h*(1 - min_offset_y))) + + new_image = np.zeros([h,w,3]) + new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :] + new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :] + new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :] + new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :] + + # 对框进行进一步的处理 + new_boxes = np.array(merge_bboxes(box_datas, cutx, cuty)) + + if len(new_boxes) == 0: + return new_image, [] + if (new_boxes[:,:4]>0).any(): + return new_image, new_boxes + else: + return new_image, [] + + def generate(self, train = True, mosaic = True): + while True: + shuffle(self.train_lines) + lines = self.train_lines + inputs = [] + targets = [] + flag = True + n = len(lines) + for i in range(len(lines)): + if mosaic == True: + if flag and (i+4) < n: + img,y = self.get_random_data_with_Mosaic(lines[i:i+4], self.image_size[0:2]) + i = (i+4) % n + else: + img,y = self.get_random_data(lines[i], self.image_size[0:2]) + i = (i+1) % n + flag = bool(1-flag) + else: + img,y = self.get_random_data(lines[i], self.image_size[0:2]) + i = (i+1) % n + if len(y)==0: + continue + boxes = np.array(y[:,:4],dtype=np.float32) + boxes[:,0] = boxes[:,0]/self.image_size[1] + boxes[:,1] = boxes[:,1]/self.image_size[0] + boxes[:,2] = boxes[:,2]/self.image_size[1] + boxes[:,3] = boxes[:,3]/self.image_size[0] + + boxes = np.maximum(np.minimum(boxes,1),0) + boxes[:,2] = boxes[:,2] - boxes[:,0] + boxes[:,3] = boxes[:,3] - boxes[:,1] + + boxes[:,0] = boxes[:,0] + boxes[:,2]/2 + boxes[:,1] = boxes[:,1] + boxes[:,3]/2 + y = np.concatenate([boxes,y[:,-1:]],axis=-1) + img = np.array(img,dtype = np.float32) + + inputs.append(np.transpose(img/255.0,(2,0,1))) + targets.append(y) + if len(targets) == self.batch_size: + tmp_inp = np.array(inputs) + tmp_targets = np.array(targets) + inputs = [] + targets = [] + yield tmp_inp, tmp_targets \ No newline at end of file diff --git a/predict.py b/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..07c7406a3cab29426d9621d755bca0d0c7c78b50 --- /dev/null +++ b/predict.py @@ -0,0 +1,18 @@ +#-------------------------------------# +# 对单张图片进行预测 +#-------------------------------------# +from yolo import YOLO +from PIL import Image + +yolo = YOLO() + +while True: + img = input('Input image filename:') + try: + image = Image.open(img) + except: + print('Open Error! 
Try again!')
+        continue
+    else:
+        r_image = yolo.detect_image(image)
+        r_image.show()
diff --git a/test.py b/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9c88c16abdb4cdb87901197b706b3aad6578368
--- /dev/null
+++ b/test.py
@@ -0,0 +1,10 @@
+import torch
+from torchsummary import summary
+from nets.CSPdarknet import darknet53
+from nets.yolo4 import YoloBody
+
+if __name__ == "__main__":
+    # 需要使用device来指定网络在GPU还是CPU运行
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    model = YoloBody(3,20).to(device)
+    summary(model, input_size=(3, 416, 416))
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cabe85eea96c66af6a27ffab6df7f9b4eab21d8
--- /dev/null
+++ b/train.py
@@ -0,0 +1,207 @@
+#-------------------------------------#
+# 对数据集进行训练
+#-------------------------------------#
+import os
+import numpy as np
+import time
+import torch
+from torch.autograd import Variable
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import torch.backends.cudnn as cudnn
+from nets.yolo_training import YOLOLoss,Generator
+from nets.yolo4 import YoloBody
+
+
+#---------------------------------------------------#
+# 获得类和先验框
+#---------------------------------------------------#
+def get_classes(classes_path):
+    '''loads the classes'''
+    with open(classes_path) as f:
+        class_names = f.readlines()
+    class_names = [c.strip() for c in class_names]
+    return class_names
+
+def get_anchors(anchors_path):
+    '''loads the anchors from a file'''
+    with open(anchors_path) as f:
+        anchors = f.readline()
+    anchors = [float(x) for x in anchors.split(',')]
+    return np.array(anchors).reshape([-1,3,2])[::-1,:,:]
+
+def fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,genval,Epoch,cuda):
+    total_loss = 0
+    val_loss = 0
+    for iteration in range(epoch_size):
+        start_time = time.time()
+        images, targets = next(gen)
+        with torch.no_grad():
+            if cuda:
+                images = Variable(torch.from_numpy(images).type(torch.FloatTensor)).cuda()
+                targets = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets]
+            else:
+                images = Variable(torch.from_numpy(images).type(torch.FloatTensor))
+                targets = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets]
+        # print(images)
+        optimizer.zero_grad()
+        outputs = net(images)
+        losses = []
+        for i in range(3):
+            loss_item = yolo_losses[i](outputs[i], targets)
+            losses.append(loss_item[0])
+        loss = sum(losses)
+        loss.backward()
+        optimizer.step()
+
+        total_loss += loss
+        waste_time = time.time() - start_time
+        print('\nEpoch:'+ str(epoch+1) + '/' + str(Epoch))
+        print('iter:' + str(iteration) + '/' + str(epoch_size) + ' || Total Loss: %.4f || %.4fs/step' % (total_loss/(iteration+1),waste_time))
+
+    print('Start Validation')
+    for iteration in range(epoch_size_val):
+        images_val, targets_val = next(genval)
+
+        with torch.no_grad():
+            if cuda:
+                images_val = Variable(torch.from_numpy(images_val).type(torch.FloatTensor)).cuda()
+                targets_val = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets_val]
+            else:
+                images_val = Variable(torch.from_numpy(images_val).type(torch.FloatTensor))
+                targets_val = [Variable(torch.from_numpy(ann).type(torch.FloatTensor)) for ann in targets_val]
+            optimizer.zero_grad()
+            outputs = net(images_val)
+            losses = []
+            for i in range(3):
+                loss_item = yolo_losses[i](outputs[i], targets_val)
+                losses.append(loss_item[0])
+            loss = sum(losses)
+            val_loss += loss
+    print('Finish Validation')
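+    # Validation reuses the same YOLOLoss heads under torch.no_grad() on the
+    # mosaic-free validation generator; the per-batch averages printed below
+    # also go into the checkpoint filename.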
print('\nEpoch:'+ str(epoch+1) + '/' + str(Epoch)) + print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss/(epoch_size+1),val_loss/(epoch_size_val+1))) + + print('Saving state, iter:', str(epoch+1)) + torch.save(model.state_dict(), 'logs/Epoch%d-Total_Loss%.4f-Val_Loss%.4f.pth'%((epoch+1),total_loss/(epoch_size+1),val_loss/(epoch_size_val+1))) + + +if __name__ == "__main__": + #-------------------------------# + # 输入的shape大小 + # 显存比较小可以使用416x416 + # 显存比较大可以使用608x608 + #-------------------------------# + input_shape = (416,416) + #-------------------------------# + # tricks的使用设置 + #-------------------------------# + Cosine_lr = False + mosaic = True + # 用于设定是否使用cuda + Cuda = True + smoooth_label = 0 + + annotation_path = '2007_train.txt' + #-------------------------------# + # 获得先验框和类 + #-------------------------------# + anchors_path = 'model_data/yolo_anchors.txt' + classes_path = 'model_data/voc_classes.txt' + class_names = get_classes(classes_path) + anchors = get_anchors(anchors_path) + num_classes = len(class_names) + + # 创建模型 + model = YoloBody(len(anchors[0]),num_classes) + model_path = "model_data/yolo4_weights.pth" + # 加快模型训练的效率 + print('Loading weights into state dict...') + model_dict = model.state_dict() + pretrained_dict = torch.load(model_path) + pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)} + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + print('Finished!') + + net = model.train() + + if Cuda: + net = torch.nn.DataParallel(model) + cudnn.benchmark = True + net = net.cuda() + + # 建立loss函数 + yolo_losses = [] + for i in range(3): + yolo_losses.append(YOLOLoss(np.reshape(anchors,[-1,2]),num_classes, \ + (input_shape[1], input_shape[0]), smoooth_label, Cuda)) + + # 0.1用于验证,0.9用于训练 + val_split = 0.1 + with open(annotation_path) as f: + lines = f.readlines() + np.random.seed(10101) + np.random.shuffle(lines) + np.random.seed(None) + num_val = int(len(lines)*val_split) + num_train = len(lines) - num_val + + if True: + lr = 1e-3 + Batch_size = 4 + Init_Epoch = 0 + Freeze_Epoch = 25 + + optimizer = optim.Adam(net.parameters(),lr) + if Cosine_lr: + lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5) + else: + lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95) + + gen = Generator(Batch_size, lines[:num_train], + (input_shape[0], input_shape[1])).generate(mosaic = mosaic) + gen_val = Generator(Batch_size, lines[num_train:], + (input_shape[0], input_shape[1])).generate(mosaic = False) + + epoch_size = int(max(1, num_train//Batch_size//2.5)) if mosaic else max(1, num_train//Batch_size) + epoch_size_val = num_val//Batch_size + #------------------------------------# + # 冻结一定部分训练 + #------------------------------------# + for param in model.backbone.parameters(): + param.requires_grad = False + + for epoch in range(Init_Epoch,Freeze_Epoch): + fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,gen_val,Freeze_Epoch,Cuda) + lr_scheduler.step() + + if True: + lr = 1e-4 + Batch_size = 2 + Freeze_Epoch = 25 + Unfreeze_Epoch = 50 + + optimizer = optim.Adam(net.parameters(),lr) + if Cosine_lr: + lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5) + else: + lr_scheduler = optim.lr_scheduler.StepLR(optimizer,step_size=1,gamma=0.95) + + gen = Generator(Batch_size, lines[:num_train], + (input_shape[0], input_shape[1])).generate(mosaic = mosaic) + gen_val = Generator(Batch_size, lines[num_train:], + 
(input_shape[0], input_shape[1])).generate(mosaic = False) + + epoch_size = int(max(1, num_train//Batch_size//2.5)) if mosaic else max(1, num_train//Batch_size) + epoch_size_val = num_val//Batch_size + #------------------------------------# + # 解冻后训练 + #------------------------------------# + for param in model.backbone.parameters(): + param.requires_grad = True + + for epoch in range(Freeze_Epoch,Unfreeze_Epoch): + fit_ont_epoch(net,yolo_losses,epoch,epoch_size,epoch_size_val,gen,gen_val,Unfreeze_Epoch,Cuda) + lr_scheduler.step() \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a19ef8fd19d26ef6947a4804f6805ce28c20ecac --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,320 @@ +from __future__ import division +import os +import math +import time +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import numpy as np +from PIL import Image, ImageDraw, ImageFont +import matplotlib.pyplot as plt + +class DecodeBox(nn.Module): + def __init__(self, anchors, num_classes, img_size): + super(DecodeBox, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.bbox_attrs = 5 + num_classes + self.img_size = img_size + + def forward(self, input): + # input为bs,3*(1+4+num_classes),13,13 + + # 一共多少张图片 + batch_size = input.size(0) + # 13,13 + input_height = input.size(2) + input_width = input.size(3) + + # 计算步长 + # 每一个特征点对应原来的图片上多少个像素点 + # 如果特征层为13x13的话,一个特征点就对应原来的图片上的32个像素点 + # 416/13 = 32 + stride_h = self.img_size[1] / input_height + stride_w = self.img_size[0] / input_width + + # 把先验框的尺寸调整成特征层大小的形式 + # 计算出先验框在特征层上对应的宽高 + scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors] + + # bs,3*(5+num_classes),13,13 -> bs,3,13,13,(5+num_classes) + prediction = input.view(batch_size, self.num_anchors, + self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous() + + # 先验框的中心位置的调整参数 + x = torch.sigmoid(prediction[..., 0]) + y = torch.sigmoid(prediction[..., 1]) + # 先验框的宽高调整参数 + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + + # 获得置信度,是否有物体 + conf = torch.sigmoid(prediction[..., 4]) + # 种类置信度 + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
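+        #----------------------------------------------------------------#
+        #   Decoding: a grid of cell offsets is added to the sigmoid-
+        #   activated x/y, the anchors are scaled by exp(w)/exp(h), and
+        #   _scale (the stride) maps the result back to 416x416 input
+        #   coordinates at the end of forward().
+        #----------------------------------------------------------------#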
+ + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + + # 生成网格,先验框中心,网格左上角 batch_size,3,13,13 + grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_width, 1).repeat( + batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor) + grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_height, 1).t().repeat( + batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor) + + # 生成先验框的宽高 + anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) + anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) + anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape) + anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape) + + # 计算调整后的先验框中心与宽高 + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + grid_x + pred_boxes[..., 1] = y.data + grid_y + pred_boxes[..., 2] = torch.exp(w.data) * anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * anchor_h + + # fig = plt.figure() + # ax = fig.add_subplot(121) + # if input_height==13: + # plt.ylim(0,13) + # plt.xlim(0,13) + # elif input_height==26: + # plt.ylim(0,26) + # plt.xlim(0,26) + # elif input_height==52: + # plt.ylim(0,52) + # plt.xlim(0,52) + # plt.scatter(grid_x.cpu(),grid_y.cpu()) + + # anchor_left = grid_x - anchor_w/2 + # anchor_top = grid_y - anchor_h/2 + + # rect1 = plt.Rectangle([anchor_left[0,0,5,5],anchor_top[0,0,5,5]],anchor_w[0,0,5,5],anchor_h[0,0,5,5],color="r",fill=False) + # rect2 = plt.Rectangle([anchor_left[0,1,5,5],anchor_top[0,1,5,5]],anchor_w[0,1,5,5],anchor_h[0,1,5,5],color="r",fill=False) + # rect3 = plt.Rectangle([anchor_left[0,2,5,5],anchor_top[0,2,5,5]],anchor_w[0,2,5,5],anchor_h[0,2,5,5],color="r",fill=False) + + # ax.add_patch(rect1) + # ax.add_patch(rect2) + # ax.add_patch(rect3) + + # ax = fig.add_subplot(122) + # if input_height==13: + # plt.ylim(0,13) + # plt.xlim(0,13) + # elif input_height==26: + # plt.ylim(0,26) + # plt.xlim(0,26) + # elif input_height==52: + # plt.ylim(0,52) + # plt.xlim(0,52) + # plt.scatter(grid_x.cpu(),grid_y.cpu()) + # plt.scatter(pred_boxes[0,:,5,5,0].cpu(),pred_boxes[0,:,5,5,1].cpu(),c='r') + + # pre_left = pred_boxes[...,0] - pred_boxes[...,2]/2 + # pre_top = pred_boxes[...,1] - pred_boxes[...,3]/2 + + # rect1 = plt.Rectangle([pre_left[0,0,5,5],pre_top[0,0,5,5]],pred_boxes[0,0,5,5,2],pred_boxes[0,0,5,5,3],color="r",fill=False) + # rect2 = plt.Rectangle([pre_left[0,1,5,5],pre_top[0,1,5,5]],pred_boxes[0,1,5,5,2],pred_boxes[0,1,5,5,3],color="r",fill=False) + # rect3 = plt.Rectangle([pre_left[0,2,5,5],pre_top[0,2,5,5]],pred_boxes[0,2,5,5,2],pred_boxes[0,2,5,5,3],color="r",fill=False) + + # ax.add_patch(rect1) + # ax.add_patch(rect2) + # ax.add_patch(rect3) + + # plt.show() + # 用于将输出调整为相对于416x416的大小 + _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor) + output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale, + conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1) + return output.data + +def letterbox_image(image, size): + iw, ih = image.size + w, h = size + scale = min(w/iw, h/ih) + nw = int(iw*scale) + nh = int(ih*scale) + + image = image.resize((nw,nh), Image.BICUBIC) + new_image = Image.new('RGB', size, (128,128,128)) + new_image.paste(image, ((w-nw)//2, (h-nh)//2)) + return new_image + +def yolo_correct_boxes(top, left, bottom, right, input_shape, 
image_shape): + new_shape = image_shape*np.min(input_shape/image_shape) + + offset = (input_shape-new_shape)/2./input_shape + scale = input_shape/new_shape + + box_yx = np.concatenate(((top+bottom)/2,(left+right)/2),axis=-1)/input_shape + box_hw = np.concatenate((bottom-top,right-left),axis=-1)/input_shape + + box_yx = (box_yx - offset) * scale + box_hw *= scale + + box_mins = box_yx - (box_hw / 2.) + box_maxes = box_yx + (box_hw / 2.) + boxes = np.concatenate([ + box_mins[:, 0:1], + box_mins[:, 1:2], + box_maxes[:, 0:1], + box_maxes[:, 1:2] + ],axis=-1) + print(np.shape(boxes)) + boxes *= np.concatenate([image_shape, image_shape],axis=-1) + return boxes + +def bbox_iou(box1, box2, x1y1x2y2=True): + """ + 计算IOU + """ + if not x1y1x2y2: + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * \ + torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0) + + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + +def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4): + # 求左上角和右下角 + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + output = [None for _ in range(len(prediction))] + for image_i, image_pred in enumerate(prediction): + # 利用置信度进行第一轮筛选 + conf_mask = (image_pred[:, 4] >= conf_thres).squeeze() + image_pred = image_pred[conf_mask] + + if not image_pred.size(0): + continue + + # 获得种类及其置信度 + class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True) + + # 获得的内容为(x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1) + + # 获得种类 + unique_labels = detections[:, -1].cpu().unique() + + if prediction.is_cuda: + unique_labels = unique_labels.cuda() + + for c in unique_labels: + # 获得某一类初步筛选后全部的预测结果 + detections_class = detections[detections[:, -1] == c] + # 按照存在物体的置信度排序 + _, conf_sort_index = torch.sort(detections_class[:, 4], descending=True) + detections_class = detections_class[conf_sort_index] + # 进行非极大抑制 + max_detections = [] + while detections_class.size(0): + # 取出这一类置信度最高的,一步一步往下判断,判断重合程度是否大于nms_thres,如果是则去除掉 + max_detections.append(detections_class[0].unsqueeze(0)) + if len(detections_class) == 1: + break + ious = bbox_iou(max_detections[-1], detections_class[1:]) + detections_class = detections_class[1:][ious < nms_thres] + # 堆叠 + max_detections = torch.cat(max_detections).data + # Add max detections to outputs + output[image_i] = max_detections if output[image_i] is 
None else torch.cat( + (output[image_i], max_detections)) + + return output + +def merge_bboxes(bboxes, cutx, cuty): + merge_bbox = [] + for i in range(len(bboxes)): + for box in bboxes[i]: + tmp_box = [] + x1,y1,x2,y2 = box[0], box[1], box[2], box[3] + + if i == 0: + if y1 > cuty or x1 > cutx: + continue + if y2 >= cuty and y1 <= cuty: + y2 = cuty + if y2-y1 < 5: + continue + if x2 >= cutx and x1 <= cutx: + x2 = cutx + if x2-x1 < 5: + continue + + if i == 1: + if y2 < cuty or x1 > cutx: + continue + + if y2 >= cuty and y1 <= cuty: + y1 = cuty + if y2-y1 < 5: + continue + + if x2 >= cutx and x1 <= cutx: + x2 = cutx + if x2-x1 < 5: + continue + + if i == 2: + if y2 < cuty or x2 < cutx: + continue + + if y2 >= cuty and y1 <= cuty: + y1 = cuty + if y2-y1 < 5: + continue + + if x2 >= cutx and x1 <= cutx: + x1 = cutx + if x2-x1 < 5: + continue + + if i == 3: + if y1 > cuty or x2 < cutx: + continue + + if y2 >= cuty and y1 <= cuty: + y2 = cuty + if y2-y1 < 5: + continue + + if x2 >= cutx and x1 <= cutx: + x1 = cutx + if x2-x1 < 5: + continue + + tmp_box.append(x1) + tmp_box.append(y1) + tmp_box.append(x2) + tmp_box.append(y2) + tmp_box.append(box[-1]) + merge_bbox.append(tmp_box) + return merge_bbox \ No newline at end of file diff --git a/video.py b/video.py new file mode 100644 index 0000000000000000000000000000000000000000..4cf4308dd363b0a3579f676ded6910ab4d14b76f --- /dev/null +++ b/video.py @@ -0,0 +1,39 @@ +#-------------------------------------# +# 调用摄像头检测 +#-------------------------------------# +from yolo import YOLO +from PIL import Image +import numpy as np +import cv2 +import time +yolo = YOLO() +# 调用摄像头 +capture=cv2.VideoCapture(0) # capture=cv2.VideoCapture("1.mp4") + +fps = 0.0 +while(True): + t1 = time.time() + # 读取某一帧 + ref,frame=capture.read() + # 格式转变,BGRtoRGB + frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) + # 转变成Image + frame = Image.fromarray(np.uint8(frame)) + + # 进行检测 + frame = np.array(yolo.detect_image(frame)) + + # RGBtoBGR满足opencv显示格式 + frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR) + + fps = ( fps + (1./(time.time()-t1)) ) / 2 + print("fps= %.2f"%(fps)) + frame = cv2.putText(frame, "fps= %.2f"%(fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) + + cv2.imshow("video",frame) + + + c= cv2.waitKey(30) & 0xff + if c==27: + capture.release() + break diff --git a/voc_annotation.py b/voc_annotation.py new file mode 100644 index 0000000000000000000000000000000000000000..3243058997cef4a411b7be90d50db4d6f10f7ab9 --- /dev/null +++ b/voc_annotation.py @@ -0,0 +1,33 @@ +import xml.etree.ElementTree as ET +from os import getcwd + +sets=[('2007', 'train'), ('2007', 'val'), ('2007', 'test')] + +wd = getcwd() +classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] + +def convert_annotation(year, image_id, list_file): + in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id)) + tree=ET.parse(in_file) + root = tree.getroot() + if root.find('object')==None: + return + list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg'%(wd, year, image_id)) + for obj in root.iter('object'): + difficult = obj.find('difficult').text + cls = obj.find('name').text + if cls not in classes or int(difficult)==1: + continue + cls_id = classes.index(cls) + xmlbox = obj.find('bndbox') + b = (int(xmlbox.find('xmin').text), int(xmlbox.find('ymin').text), int(xmlbox.find('xmax').text), int(xmlbox.find('ymax').text)) + list_file.write(" " + 
",".join([str(a) for a in b]) + ',' + str(cls_id)) + + list_file.write('\n') + +for year, image_set in sets: + image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split() + list_file = open('%s_%s.txt'%(year, image_set), 'w') + for image_id in image_ids: + convert_annotation(year, image_id, list_file) + list_file.close() diff --git a/yolo.py b/yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed9456ab5a32ac682e0ccd0c7617255723ece54 --- /dev/null +++ b/yolo.py @@ -0,0 +1,175 @@ +#-------------------------------------# +# 创建YOLO类 +#-------------------------------------# +import cv2 +import numpy as np +import colorsys +import os +import torch +import torch.nn as nn +from nets.yolo4 import YoloBody +import torch.backends.cudnn as cudnn +from PIL import Image,ImageFont, ImageDraw +from torch.autograd import Variable +from utils.utils import non_max_suppression, bbox_iou, DecodeBox,letterbox_image,yolo_correct_boxes + +class YOLO(object): + _defaults = { + "model_path": 'model_data/yolo4_voc_weights.pth', + "anchors_path": 'model_data/yolo_anchors.txt', + "classes_path": 'model_data/voc_classes.txt', + "model_image_size" : (416, 416, 3), + "confidence": 0.5, + "cuda": True + } + + @classmethod + def get_defaults(cls, n): + if n in cls._defaults: + return cls._defaults[n] + else: + return "Unrecognized attribute name '" + n + "'" + + #---------------------------------------------------# + # 初始化YOLO + #---------------------------------------------------# + def __init__(self, **kwargs): + self.__dict__.update(self._defaults) + self.class_names = self._get_class() + self.anchors = self._get_anchors() + self.generate() + #---------------------------------------------------# + # 获得所有的分类 + #---------------------------------------------------# + def _get_class(self): + classes_path = os.path.expanduser(self.classes_path) + with open(classes_path) as f: + class_names = f.readlines() + class_names = [c.strip() for c in class_names] + return class_names + + #---------------------------------------------------# + # 获得所有的先验框 + #---------------------------------------------------# + def _get_anchors(self): + anchors_path = os.path.expanduser(self.anchors_path) + with open(anchors_path) as f: + anchors = f.readline() + anchors = [float(x) for x in anchors.split(',')] + return np.array(anchors).reshape([-1, 3, 2])[::-1,:,:] + + #---------------------------------------------------# + # 获得所有的分类 + #---------------------------------------------------# + def generate(self): + os.environ["CUDA_VISIBLE_DEVICES"] = '0' + self.net = YoloBody(len(self.anchors[0]),len(self.class_names)).eval() + + # 加快模型训练的效率 + print('Loading weights into state dict...') + state_dict = torch.load(self.model_path) + self.net.load_state_dict(state_dict) + self.net = nn.DataParallel(self.net) + if self.cuda: + self.net = self.net.cuda() + + print('Finished!') + + self.yolo_decodes = [] + for i in range(3): + self.yolo_decodes.append(DecodeBox(self.anchors[i], len(self.class_names), (self.model_image_size[1], self.model_image_size[0]))) + + + print('{} model, anchors, and classes loaded.'.format(self.model_path)) + # 画框设置不同的颜色 + hsv_tuples = [(x / len(self.class_names), 1., 1.) 
+ for x in range(len(self.class_names))] + self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) + self.colors = list( + map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), + self.colors)) + + #---------------------------------------------------# + # 检测图片 + #---------------------------------------------------# + def detect_image(self, image): + image_shape = np.array(np.shape(image)[0:2]) + + crop_img = np.array(letterbox_image(image, (self.model_image_size[0],self.model_image_size[1]))) + photo = np.array(crop_img,dtype = np.float32) + photo /= 255.0 + photo = np.transpose(photo, (2, 0, 1)) + photo = photo.astype(np.float32) + images = [] + images.append(photo) + images = np.asarray(images) + + with torch.no_grad(): + images = torch.from_numpy(images) + if self.cuda: + images = images.cuda() + outputs = self.net(images) + + output_list = [] + for i in range(3): + output_list.append(self.yolo_decodes[i](outputs[i])) + output = torch.cat(output_list, 1) + batch_detections = non_max_suppression(output, len(self.class_names), + conf_thres=self.confidence, + nms_thres=0.3) + try: + batch_detections = batch_detections[0].cpu().numpy() + except: + return image + + top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence + top_conf = batch_detections[top_index,4]*batch_detections[top_index,5] + top_label = np.array(batch_detections[top_index,-1],np.int32) + top_bboxes = np.array(batch_detections[top_index,:4]) + top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1) + + # 去掉灰条 + boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape) + + font = ImageFont.truetype(font='model_data/simhei.ttf',size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32')) + + thickness = (np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0] + + for i, c in enumerate(top_label): + predicted_class = self.class_names[c] + score = top_conf[i] + + top, left, bottom, right = boxes[i] + top = top - 5 + left = left - 5 + bottom = bottom + 5 + right = right + 5 + + top = max(0, np.floor(top + 0.5).astype('int32')) + left = max(0, np.floor(left + 0.5).astype('int32')) + bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32')) + right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32')) + + # 画框框 + label = '{} {:.2f}'.format(predicted_class, score) + draw = ImageDraw.Draw(image) + label_size = draw.textsize(label, font) + label = label.encode('utf-8') + print(label) + + if top - label_size[1] >= 0: + text_origin = np.array([left, top - label_size[1]]) + else: + text_origin = np.array([left, top + 1]) + + for i in range(thickness): + draw.rectangle( + [left + i, top + i, right - i, bottom - i], + outline=self.colors[self.class_names.index(predicted_class)]) + draw.rectangle( + [tuple(text_origin), tuple(text_origin + label_size)], + fill=self.colors[self.class_names.index(predicted_class)]) + draw.text(text_origin, str(label,'UTF-8'), fill=(0, 0, 0), font=font) + del draw + return image +
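The geometry helpers defined in utils/utils.py can be sanity-checked in isolation with hand-computed inputs. The snippet below is a minimal sketch, not part of the patch; it assumes it is run from the repository root so that utils.utils is importable, and it exercises bbox_iou (with the +1 pixel convention used above) and letterbox_image on a non-square image.

import torch
from PIL import Image
from utils.utils import bbox_iou, letterbox_image

# Two boxes in x1,y1,x2,y2 form. With the +1 convention the intersection is
# 6*6 = 36 and each box covers 11*11 = 121 pixels, so the expected IoU is
# 36 / (121 + 121 - 36), roughly 0.175.
box1 = torch.FloatTensor([[0, 0, 10, 10]])
box2 = torch.FloatTensor([[5, 5, 15, 15]])
print(bbox_iou(box1, box2))

# letterbox_image keeps the aspect ratio: a 500x375 image is scaled by
# min(416/500, 416/375) = 0.832 to 416x312 and pasted onto grey padding.
photo = Image.new('RGB', (500, 375))
print(letterbox_image(photo, (416, 416)).size)  # -> (416, 416)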