diff --git a/get_map.py b/get_map.py index e4b58092f4c8d5d72a011772add0da4551698a0f..1d892747277458d4ad9f0830adecf206be35045a 100644 --- a/get_map.py +++ b/get_map.py @@ -1,901 +1,113 @@ -import glob -import json -import os -import shutil -import operator -import sys -import argparse -import math - -import numpy as np - -''' -用于计算mAP -代码克隆自https://github.com/Cartucho/mAP -如果想要设定mAP0.x,比如计算mAP0.75,可以设定MINOVERLAP = 0.75。 -''' -MINOVERLAP = 0.5 - -parser = argparse.ArgumentParser() -parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true") -parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true") -parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true") -parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.") -parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.") -args = parser.parse_args() - -''' - 0,0 ------> x (width) - | - | (Left,Top) - | *_________ - | | | - | | - y |_________| - (height) * - (Right,Bottom) -''' - -if args.ignore is None: - args.ignore = [] - -specific_iou_flagged = False -if args.set_class_iou is not None: - specific_iou_flagged = True - -os.chdir(os.path.dirname(os.path.abspath(__file__))) - -GT_PATH = os.path.join(os.getcwd(), 'input', 'ground-truth') -DR_PATH = os.path.join(os.getcwd(), 'input', 'detection-results') -IMG_PATH = os.path.join(os.getcwd(), 'input', 'images-optional') -if os.path.exists(IMG_PATH): - for dirpath, dirnames, files in os.walk(IMG_PATH): - if not files: - args.no_animation = True -else: - args.no_animation = True - -show_animation = False -if not args.no_animation: - try: - import cv2 - show_animation = True - except ImportError: - print("\"opencv-python\" not found, please install to visualize the results.") - args.no_animation = True - -draw_plot = False -if not args.no_plot: - try: - import matplotlib.pyplot as plt - draw_plot = True - except ImportError: - print("\"matplotlib\" not found, please install it to get the resulting plots.") - args.no_plot = True - - -def log_average_miss_rate(precision, fp_cumsum, num_images): - """ - log-average miss rate: - Calculated by averaging miss rates at 9 evenly spaced FPPI points - between 10e-2 and 10e0, in log-space. - - output: - lamr | log-average miss rate - mr | miss rate - fppi | false positives per image - - references: - [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the - State of the Art." Pattern Analysis and Machine Intelligence, IEEE - Transactions on 34.4 (2012): 743 - 761. 
- """ - - if precision.size == 0: - lamr = 0 - mr = 1 - fppi = 0 - return lamr, mr, fppi - - fppi = fp_cumsum / float(num_images) - mr = (1 - precision) - - fppi_tmp = np.insert(fppi, 0, -1.0) - mr_tmp = np.insert(mr, 0, 1.0) - - ref = np.logspace(-2.0, 0.0, num = 9) - for i, ref_i in enumerate(ref): - j = np.where(fppi_tmp <= ref_i)[-1][-1] - ref[i] = mr_tmp[j] - - lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref)))) - - return lamr, mr, fppi - -""" - throw error and exit -""" -def error(msg): - print(msg) - sys.exit(0) - -""" - check if the number is a float between 0.0 and 1.0 -""" -def is_float_between_0_and_1(value): - try: - val = float(value) - if val > 0.0 and val < 1.0: - return True - else: - return False - except ValueError: - return False - -""" - Calculate the AP given the recall and precision array - 1st) We compute a version of the measured precision/recall curve with - precision monotonically decreasing - 2nd) We compute the AP as the area under this curve by numerical integration. -""" -def voc_ap(rec, prec): - """ - --- Official matlab code VOC2012--- - mrec=[0 ; rec ; 1]; - mpre=[0 ; prec ; 0]; - for i=numel(mpre)-1:-1:1 - mpre(i)=max(mpre(i),mpre(i+1)); - end - i=find(mrec(2:end)~=mrec(1:end-1))+1; - ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); - """ - rec.insert(0, 0.0) # insert 0.0 at begining of list - rec.append(1.0) # insert 1.0 at end of list - mrec = rec[:] - prec.insert(0, 0.0) # insert 0.0 at begining of list - prec.append(0.0) # insert 0.0 at end of list - mpre = prec[:] - """ - This part makes the precision monotonically decreasing - (goes from the end to the beginning) - matlab: for i=numel(mpre)-1:-1:1 - mpre(i)=max(mpre(i),mpre(i+1)); - """ - for i in range(len(mpre)-2, -1, -1): - mpre[i] = max(mpre[i], mpre[i+1]) - """ - This part creates a list of indexes where the recall changes - matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1; - """ - i_list = [] - for i in range(1, len(mrec)): - if mrec[i] != mrec[i-1]: - i_list.append(i) # if it was matlab would be i + 1 - """ - The Average Precision (AP) is the area under the curve - (numerical integration) - matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); - """ - ap = 0.0 - for i in i_list: - ap += ((mrec[i]-mrec[i-1])*mpre[i]) - return ap, mrec, mpre - - -""" - Convert the lines of a file to a list -""" -def file_lines_to_list(path): - # open txt file lines to a list - with open(path) as f: - content = f.readlines() - # remove whitespace characters like `\n` at the end of each line - content = [x.strip() for x in content] - return content - -""" - Draws text in image -""" -def draw_text_in_image(img, text, pos, color, line_width): - font = cv2.FONT_HERSHEY_PLAIN - fontScale = 1 - lineType = 1 - bottomLeftCornerOfText = pos - cv2.putText(img, text, - bottomLeftCornerOfText, - font, - fontScale, - color, - lineType) - text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0] - return img, (line_width + text_width) - -""" - Plot - adjust axes -""" -def adjust_axes(r, t, fig, axes): - # get text width for re-scaling - bb = t.get_window_extent(renderer=r) - text_width_inches = bb.width / fig.dpi - # get axis width in inches - current_fig_width = fig.get_figwidth() - new_fig_width = current_fig_width + text_width_inches - propotion = new_fig_width / current_fig_width - # get axis limit - x_lim = axes.get_xlim() - axes.set_xlim([x_lim[0], x_lim[1]*propotion]) - -""" - Draw plot using Matplotlib -""" -def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, 
true_p_bar): - # sort the dictionary by decreasing value, into a list of tuples - sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1)) - # unpacking the list of tuples into two lists - sorted_keys, sorted_values = zip(*sorted_dic_by_value) - # - if true_p_bar != "": - """ - Special case to draw in: - - green -> TP: True Positives (object detected and matches ground-truth) - - red -> FP: False Positives (object detected but does not match ground-truth) - - orange -> FN: False Negatives (object not detected but present in the ground-truth) - """ - fp_sorted = [] - tp_sorted = [] - for key in sorted_keys: - fp_sorted.append(dictionary[key] - true_p_bar[key]) - tp_sorted.append(true_p_bar[key]) - plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive') - plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted) - # add legend - plt.legend(loc='lower right') - """ - Write number on side of bar - """ - fig = plt.gcf() # gcf - get current figure - axes = plt.gca() - r = fig.canvas.get_renderer() - for i, val in enumerate(sorted_values): - fp_val = fp_sorted[i] - tp_val = tp_sorted[i] - fp_str_val = " " + str(fp_val) - tp_str_val = fp_str_val + " " + str(tp_val) - # trick to paint multicolor with offset: - # first paint everything and then repaint the first number - t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold') - plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold') - if i == (len(sorted_values)-1): # largest bar - adjust_axes(r, t, fig, axes) - else: - plt.barh(range(n_classes), sorted_values, color=plot_color) - """ - Write number on side of bar - """ - fig = plt.gcf() # gcf - get current figure - axes = plt.gca() - r = fig.canvas.get_renderer() - for i, val in enumerate(sorted_values): - str_val = " " + str(val) # add a space before - if val < 1.0: - str_val = " {0:.2f}".format(val) - t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold') - # re-set axes to show number inside the figure - if i == (len(sorted_values)-1): # largest bar - adjust_axes(r, t, fig, axes) - # set window title - fig.canvas.set_window_title(window_title) - # write classes in y axis - tick_font_size = 12 - plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size) - """ - Re-scale height accordingly - """ - init_height = fig.get_figheight() - # comput the matrix height in points and inches - dpi = fig.dpi - height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing) - height_in = height_pt / dpi - # compute the required figure height - top_margin = 0.15 # in percentage of the figure height - bottom_margin = 0.05 # in percentage of the figure height - figure_height = height_in / (1 - top_margin - bottom_margin) - # set new height - if figure_height > init_height: - fig.set_figheight(figure_height) - - # set plot title - plt.title(plot_title, fontsize=14) - # set axis titles - # plt.xlabel('classes') - plt.xlabel(x_label, fontsize='large') - # adjust size of window - fig.tight_layout() - # save the plot - fig.savefig(output_path) - # show image - if to_show: - plt.show() - # close the plot - plt.close() - -""" - Create a ".temp_files/" and "results/" directory -""" -TEMP_FILES_PATH = ".temp_files" -if not os.path.exists(TEMP_FILES_PATH): # if it doesn't exist already - os.makedirs(TEMP_FILES_PATH) -results_files_path = "results" -if os.path.exists(results_files_path): # if it exist already - # reset the 
results directory - shutil.rmtree(results_files_path) - -os.makedirs(results_files_path) -if draw_plot: - os.makedirs(os.path.join(results_files_path, "AP")) - os.makedirs(os.path.join(results_files_path, "F1")) - os.makedirs(os.path.join(results_files_path, "Recall")) - os.makedirs(os.path.join(results_files_path, "Precision")) -if show_animation: - os.makedirs(os.path.join(results_files_path, "images", "detections_one_by_one")) - -""" - ground-truth - Load each of the ground-truth files into a temporary ".json" file. - Create a list of all the class names present in the ground-truth (gt_classes). -""" -# get a list with the ground-truth files -ground_truth_files_list = glob.glob(GT_PATH + '/*.txt') -if len(ground_truth_files_list) == 0: - error("Error: No ground-truth files found!") -ground_truth_files_list.sort() -# dictionary with counter per class -gt_counter_per_class = {} -counter_images_per_class = {} - -for txt_file in ground_truth_files_list: - #print(txt_file) - file_id = txt_file.split(".txt", 1)[0] - file_id = os.path.basename(os.path.normpath(file_id)) - # check if there is a correspondent detection-results file - temp_path = os.path.join(DR_PATH, (file_id + ".txt")) - if not os.path.exists(temp_path): - error_msg = "Error. File not found: {}\n".format(temp_path) - error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" - error(error_msg) - lines_list = file_lines_to_list(txt_file) - # create ground-truth dictionary - bounding_boxes = [] - is_difficult = False - already_seen_classes = [] - for line in lines_list: - try: - if "difficult" in line: - class_name, left, top, right, bottom, _difficult = line.split() - is_difficult = True - else: - class_name, left, top, right, bottom = line.split() - - except: - if "difficult" in line: - line_split = line.split() - _difficult = line_split[-1] - bottom = line_split[-2] - right = line_split[-3] - top = line_split[-4] - left = line_split[-5] - class_name = "" - for name in line_split[:-5]: - class_name += name + " " - class_name = class_name[:-1] - is_difficult = True - else: - line_split = line.split() - bottom = line_split[-1] - right = line_split[-2] - top = line_split[-3] - left = line_split[-4] - class_name = "" - for name in line_split[:-4]: - class_name += name + " " - class_name = class_name[:-1] - if class_name in args.ignore: - continue - bbox = left + " " + top + " " + right + " " +bottom - if is_difficult: - bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True}) - is_difficult = False - else: - bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False}) - if class_name in gt_counter_per_class: - gt_counter_per_class[class_name] += 1 - else: - gt_counter_per_class[class_name] = 1 - - if class_name not in already_seen_classes: - if class_name in counter_images_per_class: - counter_images_per_class[class_name] += 1 - else: - counter_images_per_class[class_name] = 1 - already_seen_classes.append(class_name) - - - with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile: - json.dump(bounding_boxes, outfile) - -gt_classes = list(gt_counter_per_class.keys()) -gt_classes = sorted(gt_classes) -n_classes = len(gt_classes) - -""" - Check format of the flag --set-class-iou (if used) - e.g. check if class exists -""" -if specific_iou_flagged: - n_args = len(args.set_class_iou) - error_msg = \ - '\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]' - if n_args % 2 != 0: - error('Error, missing arguments. 
Flag usage:' + error_msg) - # [class_1] [IoU_1] [class_2] [IoU_2] - # specific_iou_classes = ['class_1', 'class_2'] - specific_iou_classes = args.set_class_iou[::2] # even - # iou_list = ['IoU_1', 'IoU_2'] - iou_list = args.set_class_iou[1::2] # odd - if len(specific_iou_classes) != len(iou_list): - error('Error, missing arguments. Flag usage:' + error_msg) - for tmp_class in specific_iou_classes: - if tmp_class not in gt_classes: - error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg) - for num in iou_list: - if not is_float_between_0_and_1(num): - error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg) - -""" - detection-results - Load each of the detection-results files into a temporary ".json" file. -""" -dr_files_list = glob.glob(DR_PATH + '/*.txt') -dr_files_list.sort() - -for class_index, class_name in enumerate(gt_classes): - bounding_boxes = [] - for txt_file in dr_files_list: - file_id = txt_file.split(".txt",1)[0] - file_id = os.path.basename(os.path.normpath(file_id)) - temp_path = os.path.join(GT_PATH, (file_id + ".txt")) - if class_index == 0: - if not os.path.exists(temp_path): - error_msg = "Error. File not found: {}\n".format(temp_path) - error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)" - error(error_msg) - lines = file_lines_to_list(txt_file) - for line in lines: - try: - tmp_class_name, confidence, left, top, right, bottom = line.split() - except: - line_split = line.split() - bottom = line_split[-1] - right = line_split[-2] - top = line_split[-3] - left = line_split[-4] - confidence = line_split[-5] - tmp_class_name = "" - for name in line_split[:-5]: - tmp_class_name += name + " " - tmp_class_name = tmp_class_name[:-1] - - if tmp_class_name == class_name: - bbox = left + " " + top + " " + right + " " +bottom - bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox}) - - bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True) - with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile: - json.dump(bounding_boxes, outfile) - -""" - Calculate the AP for each class -""" -sum_AP = 0.0 -ap_dictionary = {} -lamr_dictionary = {} -with open(results_files_path + "/results.txt", 'w') as results_file: - results_file.write("# AP and precision/recall per class\n") - count_true_positives = {} - - for class_index, class_name in enumerate(gt_classes): - count_true_positives[class_name] = 0 - """ - Load detection-results of that class - """ - dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json" - dr_data = json.load(open(dr_file)) - """ - Assign detection-results to ground-truth objects - """ - nd = len(dr_data) - tp = [0] * nd - fp = [0] * nd - score = [0] * nd - score05_idx = 0 - for idx, detection in enumerate(dr_data): - file_id = detection["file_id"] - score[idx] = float(detection["confidence"]) - if score[idx] > 0.5: - score05_idx = idx - - if show_animation: - ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*") - if len(ground_truth_img) == 0: - error("Error. Image not found with id: " + file_id) - elif len(ground_truth_img) > 1: - error("Error. 
Multiple image with id: " + file_id) - else: - img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0]) - img_cumulative_path = results_files_path + "/images/" + ground_truth_img[0] - if os.path.isfile(img_cumulative_path): - img_cumulative = cv2.imread(img_cumulative_path) - else: - img_cumulative = img.copy() - bottom_border = 60 - BLACK = [0, 0, 0] - img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK) - - gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" - ground_truth_data = json.load(open(gt_file)) - ovmax = -1 - gt_match = -1 - bb = [ float(x) for x in detection["bbox"].split() ] - for obj in ground_truth_data: - if obj["class_name"] == class_name: - bbgt = [ float(x) for x in obj["bbox"].split() ] - bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] - iw = bi[2] - bi[0] + 1 - ih = bi[3] - bi[1] + 1 - if iw > 0 and ih > 0: - # compute overlap (IoU) = area of intersection / area of union - ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] - + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih - ov = iw * ih / ua - if ov > ovmax: - ovmax = ov - gt_match = obj - - if show_animation: - status = "NO MATCH FOUND!" - min_overlap = MINOVERLAP - if specific_iou_flagged: - if class_name in specific_iou_classes: - index = specific_iou_classes.index(class_name) - min_overlap = float(iou_list[index]) - if ovmax >= min_overlap: - if "difficult" not in gt_match: - if not bool(gt_match["used"]): - tp[idx] = 1 - gt_match["used"] = True - count_true_positives[class_name] += 1 - with open(gt_file, 'w') as f: - f.write(json.dumps(ground_truth_data)) - if show_animation: - status = "MATCH!" - else: - fp[idx] = 1 - if show_animation: - status = "REPEATED MATCH!" - else: - fp[idx] = 1 - if ovmax > 0: - status = "INSUFFICIENT OVERLAP" - - """ - Draw image to show animation - """ - if show_animation: - height, widht = img.shape[:2] - # colors (OpenCV works with BGR) - white = (255,255,255) - light_blue = (255,200,100) - green = (0,255,0) - light_red = (30,30,255) - # 1st line - margin = 10 - v_pos = int(height - margin - (bottom_border / 2.0)) - text = "Image: " + ground_truth_img[0] + " " - img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) - text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " " - img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width) - if ovmax != -1: - color = light_red - if status == "INSUFFICIENT OVERLAP": - text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100) - else: - text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100) - color = green - img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) - # 2nd line - v_pos += int(bottom_border / 2.0) - rank_pos = str(idx+1) # rank position (idx starts at 0) - text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100) - img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) - color = light_red - if status == "MATCH!": - color = green - text = "Result: " + status + " " - img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) - - font = cv2.FONT_HERSHEY_SIMPLEX - if ovmax > 0: # if there is intersections between the bounding-boxes - bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ] - 
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) - cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) - cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA) - bb = [int(i) for i in bb] - cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2) - cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2) - cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA) - # show image - cv2.imshow("Animation", img) - cv2.waitKey(20) # show for 20 ms - # save image to results - output_img_path = results_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg" - cv2.imwrite(output_img_path, img) - # save the image with all the objects drawn to it - cv2.imwrite(img_cumulative_path, img_cumulative) - - cumsum = 0 - for idx, val in enumerate(fp): - fp[idx] += cumsum - cumsum += val - - cumsum = 0 - for idx, val in enumerate(tp): - tp[idx] += cumsum - cumsum += val - - rec = tp[:] - for idx, val in enumerate(tp): - rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1) - - prec = tp[:] - for idx, val in enumerate(tp): - prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1) - - ap, mrec, mprec = voc_ap(rec[:], prec[:]) - F1 = np.array(rec)*np.array(prec)*2 / np.where((np.array(prec)+np.array(rec))==0, 1, (np.array(prec)+np.array(rec))) - - sum_AP += ap - text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) - - if len(prec)>0: - F1_text = "{0:.2f}".format(F1[score05_idx]) + " = " + class_name + " F1 " - Recall_text = "{0:.2f}%".format(rec[score05_idx]*100) + " = " + class_name + " Recall " - Precision_text = "{0:.2f}%".format(prec[score05_idx]*100) + " = " + class_name + " Precision " - else: - F1_text = "0.00" + " = " + class_name + " F1 " - Recall_text = "0.00%" + " = " + class_name + " Recall " - Precision_text = "0.00%" + " = " + class_name + " Precision " - - rounded_prec = [ '%.2f' % elem for elem in prec ] - rounded_rec = [ '%.2f' % elem for elem in rec ] - results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") - if not args.quiet: - if len(prec)>0: - print(text + "\t||\tscore_threhold=0.5 : " + "F1=" + "{0:.2f}".format(F1[score05_idx])\ - + " ; Recall=" + "{0:.2f}%".format(rec[score05_idx]*100) + " ; Precision=" + "{0:.2f}%".format(prec[score05_idx]*100)) - else: - print(text + "\t||\tscore_threhold=0.5 : F1=0.00% ; Recall=0.00% ; Precision=0.00%") - ap_dictionary[class_name] = ap - - n_images = counter_images_per_class[class_name] - lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images) - lamr_dictionary[class_name] = lamr - - """ - Draw plot - """ - if draw_plot: - plt.plot(rec, prec, '-o') - area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]] - area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]] - plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r') - - fig = plt.gcf() - fig.canvas.set_window_title('AP ' + class_name) - - plt.title('class: ' + text) - plt.xlabel('Recall') - plt.ylabel('Precision') - axes = plt.gca() - axes.set_xlim([0.0,1.0]) - axes.set_ylim([0.0,1.05]) - fig.savefig(results_files_path + "/AP/" + class_name + ".png") - plt.cla() - - plt.plot(score, F1, "-", color='orangered') - plt.title('class: ' + F1_text + "\nscore_threhold=0.5") - plt.xlabel('Score_Threhold') - plt.ylabel('F1') - axes = plt.gca() - 
axes.set_xlim([0.0,1.0]) - axes.set_ylim([0.0,1.05]) - fig.savefig(results_files_path + "/F1/" + class_name + ".png") - plt.cla() - - plt.plot(score, rec, "-H", color='gold') - plt.title('class: ' + Recall_text + "\nscore_threhold=0.5") - plt.xlabel('Score_Threhold') - plt.ylabel('Recall') - axes = plt.gca() - axes.set_xlim([0.0,1.0]) - axes.set_ylim([0.0,1.05]) - fig.savefig(results_files_path + "/Recall/" + class_name + ".png") - plt.cla() - - plt.plot(score, prec, "-s", color='palevioletred') - plt.title('class: ' + Precision_text + "\nscore_threhold=0.5") - plt.xlabel('Score_Threhold') - plt.ylabel('Precision') - axes = plt.gca() - axes.set_xlim([0.0,1.0]) - axes.set_ylim([0.0,1.05]) - fig.savefig(results_files_path + "/Precision/" + class_name + ".png") - plt.cla() - - if show_animation: - cv2.destroyAllWindows() - - results_file.write("\n# mAP of all classes\n") - mAP = sum_AP / n_classes - text = "mAP = {0:.2f}%".format(mAP*100) - results_file.write(text + "\n") - print(text) - -# remove the temp_files directory -shutil.rmtree(TEMP_FILES_PATH) - -""" - Count total of detection-results -""" -# iterate through all the files -det_counter_per_class = {} -for txt_file in dr_files_list: - # get lines to list - lines_list = file_lines_to_list(txt_file) - for line in lines_list: - class_name = line.split()[0] - # check if class is in the ignore list, if yes skip - if class_name in args.ignore: - continue - # count that object - if class_name in det_counter_per_class: - det_counter_per_class[class_name] += 1 - else: - # if class didn't exist yet - det_counter_per_class[class_name] = 1 -#print(det_counter_per_class) -dr_classes = list(det_counter_per_class.keys()) - - -""" - Plot the total number of occurences of each class in the ground-truth -""" -if draw_plot: - window_title = "ground-truth-info" - plot_title = "ground-truth\n" - plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)" - x_label = "Number of objects per class" - output_path = results_files_path + "/ground-truth-info.png" - to_show = False - plot_color = 'forestgreen' - draw_plot_func( - gt_counter_per_class, - n_classes, - window_title, - plot_title, - x_label, - output_path, - to_show, - plot_color, - '', - ) - -""" - Write number of ground-truth objects per class to results.txt -""" -with open(results_files_path + "/results.txt", 'a') as results_file: - results_file.write("\n# Number of ground-truth objects per class\n") - for class_name in sorted(gt_counter_per_class): - results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n") - -""" - Finish counting true positives -""" -for class_name in dr_classes: - # if class exists in detection-result but not in ground-truth then there are no true positives in that class - if class_name not in gt_classes: - count_true_positives[class_name] = 0 -#print(count_true_positives) - -""" - Plot the total number of occurences of each class in the "detection-results" folder -""" -if draw_plot: - window_title = "detection-results-info" - # Plot title - plot_title = "detection-results\n" - plot_title += "(" + str(len(dr_files_list)) + " files and " - count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values())) - plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)" - # end Plot title - x_label = "Number of objects per class" - output_path = results_files_path + "/detection-results-info.png" - to_show = False - plot_color = 'forestgreen' - true_p_bar = 
count_true_positives
-    draw_plot_func(
-        det_counter_per_class,
-        len(det_counter_per_class),
-        window_title,
-        plot_title,
-        x_label,
-        output_path,
-        to_show,
-        plot_color,
-        true_p_bar
-        )
-
-"""
- Write number of detected objects per class to results.txt
-"""
-with open(results_files_path + "/results.txt", 'a') as results_file:
-    results_file.write("\n# Number of detected objects per class\n")
-    for class_name in sorted(dr_classes):
-        n_det = det_counter_per_class[class_name]
-        text = class_name + ": " + str(n_det)
-        text += " (tp:" + str(count_true_positives[class_name]) + ""
-        text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
-        results_file.write(text)
-
-"""
- Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
-"""
-if draw_plot:
-    window_title = "lamr"
-    plot_title = "log-average miss rate"
-    x_label = "log-average miss rate"
-    output_path = results_files_path + "/lamr.png"
-    to_show = False
-    plot_color = 'royalblue'
-    draw_plot_func(
-        lamr_dictionary,
-        n_classes,
-        window_title,
-        plot_title,
-        x_label,
-        output_path,
-        to_show,
-        plot_color,
-        ""
-        )
-
-"""
- Draw mAP plot (Show AP's of all classes in decreasing order)
-"""
-if draw_plot:
-    window_title = "mAP"
-    plot_title = "mAP = {0:.2f}%".format(mAP*100)
-    x_label = "Average Precision"
-    output_path = results_files_path + "/mAP.png"
-    to_show = True
-    plot_color = 'royalblue'
-    draw_plot_func(
-        ap_dictionary,
-        n_classes,
-        window_title,
-        plot_title,
-        x_label,
-        output_path,
-        to_show,
-        plot_color,
-        ""
-        )
+import os
+import xml.etree.ElementTree as ET
+
+from PIL import Image
+from tqdm import tqdm
+
+from utils.utils import get_classes
+from utils.utils_map import get_coco_map, get_map
+from yolo import YOLO
+
+if __name__ == "__main__":
+    '''
+    Unlike AP, Recall and Precision are not areas under a curve, so their values differ at
+    every confidence threshold. The Recall and Precision reported by the mAP computation
+    are the values obtained with a prediction confidence threshold of 0.5.
+
+    The txt files produced in ./map_out/detection-results/ contain more boxes than a direct
+    predict run, because a deliberately low threshold is used here: the goal is to compute
+    Recall and Precision at every threshold, which is what the mAP calculation requires.
+    '''
+    #------------------------------------------------------------------------------------------------------------------#
+    # map_mode selects what this script computes when it runs:
+    # map_mode 0 runs the whole pipeline: obtain predictions, obtain ground truth, compute VOC_map.
+    # map_mode 1 only obtains the prediction results.
+    # map_mode 2 only obtains the ground-truth boxes.
+    # map_mode 3 only computes VOC_map.
+    # map_mode 4 uses the COCO toolbox to compute the 0.50:0.95 mAP of the current dataset.
+    #            The predictions and ground truth must be obtained first, and pycocotools must be installed.
+    #-------------------------------------------------------------------------------------------------------------------#
+    map_mode = 0
+    #-------------------------------------------------------#
+    # classes_path selects the classes for which VOC_map is measured.
+    # In general it should match the classes_path used for
+    # training and prediction.
+    #-------------------------------------------------------#
+    classes_path = 'model_data/voc_classes.txt'
+    #-------------------------------------------------------#
+    # MINOVERLAP selects the mAP0.x to compute.
+    # For example, set MINOVERLAP = 0.75 to compute mAP0.75.
+    #-------------------------------------------------------#
+    MINOVERLAP = 0.5
+    #-------------------------------------------------------#
+    # map_vis toggles visualization during the VOC_map computation.
+    #-------------------------------------------------------#
+    map_vis = False
+    #-------------------------------------------------------#
+    # Path to the folder containing the VOC dataset.
+    # Defaults to the VOC dataset in the repository root.
+    #-------------------------------------------------------#
+    VOCdevkit_path = 'VOCdevkit'
+    #-------------------------------------------------------#
+    # Output folder for the results, map_out by default.
+    #-------------------------------------------------------#
+    map_out_path = 'map_out'
+
+    image_ids = 
open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split() + + if not os.path.exists(map_out_path): + os.makedirs(map_out_path) + if not os.path.exists(os.path.join(map_out_path, 'ground-truth')): + os.makedirs(os.path.join(map_out_path, 'ground-truth')) + if not os.path.exists(os.path.join(map_out_path, 'detection-results')): + os.makedirs(os.path.join(map_out_path, 'detection-results')) + if not os.path.exists(os.path.join(map_out_path, 'images-optional')): + os.makedirs(os.path.join(map_out_path, 'images-optional')) + + class_names, _ = get_classes(classes_path) + + if map_mode == 0 or map_mode == 1: + print("Load model.") + yolo = YOLO(confidence = 0.001, nms_iou = 0.5) + print("Load model done.") + + print("Get predict result.") + for image_id in tqdm(image_ids): + image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/"+image_id+".jpg") + image = Image.open(image_path) + if map_vis: + image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg")) + yolo.get_map_txt(image_id, image, class_names, map_out_path) + print("Get predict result done.") + + if map_mode == 0 or map_mode == 2: + print("Get ground truth result.") + for image_id in tqdm(image_ids): + with open(os.path.join(map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f: + root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/"+image_id+".xml")).getroot() + for obj in root.findall('object'): + difficult_flag = False + if obj.find('difficult')!=None: + difficult = obj.find('difficult').text + if int(difficult)==1: + difficult_flag = True + obj_name = obj.find('name').text + if obj_name not in class_names: + continue + bndbox = obj.find('bndbox') + left = bndbox.find('xmin').text + top = bndbox.find('ymin').text + right = bndbox.find('xmax').text + bottom = bndbox.find('ymax').text + + if difficult_flag: + new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom)) + else: + new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom)) + print("Get ground truth result done.") + + if map_mode == 0 or map_mode == 3: + print("Get map.") + get_map(MINOVERLAP, True, path = map_out_path) + print("Get map done.") + + if map_mode == 4: + print("Get map.") + get_coco_map(class_names = class_names, path = map_out_path) + print("Get map done.") diff --git a/kmeans_for_anchors.py b/kmeans_for_anchors.py index 999f04f3f6ad98e1e32af43ac40cebc03514497a..b9c0dfa34f0fa345f60aa628fc0f2fb8613c0dda 100644 --- a/kmeans_for_anchors.py +++ b/kmeans_for_anchors.py @@ -24,32 +24,45 @@ def avg_iou(box,cluster): return np.mean([np.max(cas_iou(box[i],cluster)) for i in range(box.shape[0])]) def kmeans(box,k): - # 取出一共有多少框 + #-------------------------------------------------------------# + # 取出一共有多少框 + #-------------------------------------------------------------# row = box.shape[0] - # 每个框各个点的位置 + #-------------------------------------------------------------# + # 每个框各个点的位置 + #-------------------------------------------------------------# distance = np.empty((row,k)) - # 最后的聚类位置 + #-------------------------------------------------------------# + # 最后的聚类位置 + #-------------------------------------------------------------# last_clu = np.zeros((row,)) np.random.seed() - # 随机选5个当聚类中心 + #-------------------------------------------------------------# + # 随机选5个当聚类中心 + #-------------------------------------------------------------# cluster = box[np.random.choice(row,k,replace = False)] - # cluster = random.sample(row, k) while True: - # 计算每一行距离五个点的iou情况。 + 
#-------------------------------------------------------------# + # 计算每一行距离五个点的iou情况。 + #-------------------------------------------------------------# for i in range(row): distance[i] = 1 - cas_iou(box[i],cluster) - # 取出最小点 + #-------------------------------------------------------------# + # 取出最小点 + #-------------------------------------------------------------# near = np.argmin(distance,axis=1) if (last_clu == near).all(): break - # 求每一个类的中位点 + #-------------------------------------------------------------# + # 求每一个类的中位点 + #-------------------------------------------------------------# for j in range(k): cluster[j] = np.median( box[near == j],axis=0) @@ -60,7 +73,9 @@ def kmeans(box,k): def load_data(path): data = [] - # 对于每一个xml都寻找box + #-------------------------------------------------------------# + # 对于每一个xml都寻找box + #-------------------------------------------------------------# for xml_file in glob.glob('{}/*xml'.format(path)): tree = ET.parse(xml_file) height = int(tree.findtext('./size/height')) @@ -68,7 +83,9 @@ def load_data(path): if height<=0 or width<=0: continue - # 对于每一个目标都获得它的宽高 + #-------------------------------------------------------------# + # 对于每一个目标都获得它的宽高 + #-------------------------------------------------------------# for obj in tree.iter('object'): xmin = int(float(obj.findtext('bndbox/xmin'))) / width ymin = int(float(obj.findtext('bndbox/ymin'))) / height @@ -85,18 +102,26 @@ def load_data(path): if __name__ == '__main__': - # 运行该程序会计算'./VOCdevkit/VOC2007/Annotations'的xml - # 会生成yolo_anchors.txt - SIZE = 416 + #-------------------------------------------------------------# + # 运行该程序会计算'./VOCdevkit/VOC2007/Annotations'的xml + # 会生成yolo_anchors.txt + #-------------------------------------------------------------# + SIZE = 416 anchors_num = 9 - # 载入数据集,可以使用VOC的xml - path = r'./VOCdevkit/VOC2007/Annotations' + #-------------------------------------------------------------# + # 载入数据集,可以使用VOC的xml + #-------------------------------------------------------------# + path = r'./VOCdevkit/VOC2007/Annotations' - # 载入所有的xml - # 存储格式为转化为比例后的width,height + #-------------------------------------------------------------# + # 载入所有的xml + # 存储格式为转化为比例后的width,height + #-------------------------------------------------------------# data = load_data(path) - # 使用k聚类算法 + #-------------------------------------------------------------# + # 使用k聚类算法 + #-------------------------------------------------------------# out = kmeans(data,anchors_num) out = out[np.argsort(out[:,0])] print('acc:{:.2f}%'.format(avg_iou(data,out) * 100)) diff --git a/nets/CSPdarknet.py b/nets/CSPdarknet.py index b6f7965bfc6f7a17130f6976e41c7c3af14600fc..d339322e72f0fa47782cd86a0b1b3910d1405f4c 100644 --- a/nets/CSPdarknet.py +++ b/nets/CSPdarknet.py @@ -1,4 +1,5 @@ import math +from collections import OrderedDict import torch import torch.nn as nn diff --git a/nets/__init__.py b/nets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4287ca8617970fa8fc025b75cb319c7032706910 --- /dev/null +++ b/nets/__init__.py @@ -0,0 +1 @@ +# \ No newline at end of file diff --git a/nets/yolo.py b/nets/yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..11a12734fbb978fde6d8720e0631d641c900c47c --- /dev/null +++ b/nets/yolo.py @@ -0,0 +1,185 @@ +from collections import OrderedDict + +import torch +import torch.nn as nn + +from nets.CSPdarknet import darknet53 + + +def conv2d(filter_in, filter_out, kernel_size, stride=1): + pad = (kernel_size - 1) // 2 if kernel_size else 0 + 
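    # (kernel_size - 1) // 2 is 'same'-style padding for the odd kernels used below:
+    # kernel 3 -> pad 1, kernel 1 -> pad 0, so the convolution preserves the spatial
+    # size and only the stride changes the feature-map resolution.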
+    return nn.Sequential(OrderedDict([
+        ("conv", nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=pad, bias=False)),
+        ("bn", nn.BatchNorm2d(filter_out)),
+        ("relu", nn.LeakyReLU(0.1)),
+    ]))
+
+#---------------------------------------------------#
+# The SPP block: max-pool with kernels of several
+# sizes, then stack the pooled maps
+#---------------------------------------------------#
+class SpatialPyramidPooling(nn.Module):
+    def __init__(self, pool_sizes=[5, 9, 13]):
+        super(SpatialPyramidPooling, self).__init__()
+
+        self.maxpools = nn.ModuleList([nn.MaxPool2d(pool_size, 1, pool_size//2) for pool_size in pool_sizes])
+
+    def forward(self, x):
+        features = [maxpool(x) for maxpool in self.maxpools[::-1]]
+        features = torch.cat(features + [x], dim=1)
+
+        return features
+
+#---------------------------------------------------#
+# Convolution + upsampling
+#---------------------------------------------------#
+class Upsample(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(Upsample, self).__init__()
+
+        self.upsample = nn.Sequential(
+            conv2d(in_channels, out_channels, 1),
+            nn.Upsample(scale_factor=2, mode='nearest')
+        )
+
+    def forward(self, x):
+        x = self.upsample(x)
+        return x
+
+#---------------------------------------------------#
+# Block of three convolutions
+#---------------------------------------------------#
+def make_three_conv(filters_list, in_filters):
+    m = nn.Sequential(
+        conv2d(in_filters, filters_list[0], 1),
+        conv2d(filters_list[0], filters_list[1], 3),
+        conv2d(filters_list[1], filters_list[0], 1),
+    )
+    return m
+
+#---------------------------------------------------#
+# Block of five convolutions
+#---------------------------------------------------#
+def make_five_conv(filters_list, in_filters):
+    m = nn.Sequential(
+        conv2d(in_filters, filters_list[0], 1),
+        conv2d(filters_list[0], filters_list[1], 3),
+        conv2d(filters_list[1], filters_list[0], 1),
+        conv2d(filters_list[0], filters_list[1], 3),
+        conv2d(filters_list[1], filters_list[0], 1),
+    )
+    return m
+
+#---------------------------------------------------#
+# Produces the final yolov4 head outputs
+#---------------------------------------------------#
+def yolo_head(filters_list, in_filters):
+    m = nn.Sequential(
+        conv2d(in_filters, filters_list[0], 3),
+        nn.Conv2d(filters_list[0], filters_list[1], 1),
+    )
+    return m
+
+#---------------------------------------------------#
+# yolo_body
+#---------------------------------------------------#
+class YoloBody(nn.Module):
+    def __init__(self, anchors_mask, num_classes):
+        super(YoloBody, self).__init__()
+        #---------------------------------------------------#
+        # Build the CSPdarknet53 backbone model.
+        # It returns three effective feature layers with shapes:
+        # 52,52,256
+        # 26,26,512
+        # 13,13,1024
+        #---------------------------------------------------#
+        self.backbone = darknet53(None)
+
+        self.conv1 = make_three_conv([512,1024],1024)
+        self.SPP = SpatialPyramidPooling()
+        self.conv2 = make_three_conv([512,1024],2048)
+
+        self.upsample1 = Upsample(512,256)
+        self.conv_for_P4 = conv2d(512,256,1)
+        self.make_five_conv1 = make_five_conv([256, 512],512)
+
+        self.upsample2 = Upsample(256,128)
+        self.conv_for_P3 = conv2d(256,128,1)
+        self.make_five_conv2 = make_five_conv([128, 256],256)
+
+        # 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20) = 75
+        self.yolo_head3 = yolo_head([256, len(anchors_mask[0]) * (5 + num_classes)],128)
+
+        self.down_sample1 = conv2d(128,256,3,stride=2)
+        self.make_five_conv3 = make_five_conv([256, 512],512)
+
+        # 3*(5+num_classes) = 3*(5+20) = 3*(4+1+20) = 75
+        self.yolo_head2 = yolo_head([512, len(anchors_mask[1]) * (5 + num_classes)],256)
+
+        self.down_sample2 = 
conv2d(256,512,3,stride=2) + self.make_five_conv4 = make_five_conv([512, 1024],1024) + + # 3*(5+num_classes)=3*(5+20)=3*(4+1+20)=75 + self.yolo_head1 = yolo_head([1024, len(anchors_mask[2]) * (5 + num_classes)],512) + + + def forward(self, x): + # backbone + x2, x1, x0 = self.backbone(x) + + # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,2048 + P5 = self.conv1(x0) + P5 = self.SPP(P5) + # 13,13,2048 -> 13,13,512 -> 13,13,1024 -> 13,13,512 + P5 = self.conv2(P5) + + # 13,13,512 -> 13,13,256 -> 26,26,256 + P5_upsample = self.upsample1(P5) + # 26,26,512 -> 26,26,256 + P4 = self.conv_for_P4(x1) + # 26,26,256 + 26,26,256 -> 26,26,512 + P4 = torch.cat([P4,P5_upsample],axis=1) + # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 + P4 = self.make_five_conv1(P4) + + # 26,26,256 -> 26,26,128 -> 52,52,128 + P4_upsample = self.upsample2(P4) + # 52,52,256 -> 52,52,128 + P3 = self.conv_for_P3(x2) + # 52,52,128 + 52,52,128 -> 52,52,256 + P3 = torch.cat([P3,P4_upsample],axis=1) + # 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 + P3 = self.make_five_conv2(P3) + + # 52,52,128 -> 26,26,256 + P3_downsample = self.down_sample1(P3) + # 26,26,256 + 26,26,256 -> 26,26,512 + P4 = torch.cat([P3_downsample,P4],axis=1) + # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 + P4 = self.make_five_conv3(P4) + + # 26,26,256 -> 13,13,512 + P4_downsample = self.down_sample2(P4) + # 13,13,512 + 13,13,512 -> 13,13,1024 + P5 = torch.cat([P4_downsample,P5],axis=1) + # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 + P5 = self.make_five_conv4(P5) + + #---------------------------------------------------# + # 第三个特征层 + # y3=(batch_size,75,52,52) + #---------------------------------------------------# + out2 = self.yolo_head3(P3) + #---------------------------------------------------# + # 第二个特征层 + # y2=(batch_size,75,26,26) + #---------------------------------------------------# + out1 = self.yolo_head2(P4) + #---------------------------------------------------# + # 第一个特征层 + # y1=(batch_size,75,13,13) + #---------------------------------------------------# + out0 = self.yolo_head1(P5) + + return out0, out1, out2 + diff --git a/nets/yolo_training.py b/nets/yolo_training.py index e64a79700de085eb9aa703fa6c0050890a4309d3..cb0ba7d3ecd5de75fcf464eac32e7012691e98ca 100644 --- a/nets/yolo_training.py +++ b/nets/yolo_training.py @@ -1,431 +1,400 @@ -import os - -import math -import numpy as np -import scipy.signal import torch import torch.nn as nn -from matplotlib import pyplot as plt - -def jaccard(_box_a, _box_b): - b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2 - b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2 - b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2 - b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2 - box_a = torch.zeros_like(_box_a) - box_b = torch.zeros_like(_box_b) - box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2 - box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2 - A = box_a.size(0) - B = box_b.size(0) - max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), - box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) - min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), - box_b[:, :2].unsqueeze(0).expand(A, B, 2)) - inter = torch.clamp((max_xy - min_xy), min=0) - - inter = inter[:, :, 0] * inter[:, :, 1] - # 
计算先验框和真实框各自的面积 - area_a = ((box_a[:, 2]-box_a[:, 0]) * - (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] - area_b = ((box_b[:, 2]-box_b[:, 0]) * - (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] - # 求IOU - union = area_a + area_b - inter - return inter / union # [A,B] - -#---------------------------------------------------# -# 平滑标签 -#---------------------------------------------------# -def smooth_labels(y_true, label_smoothing,num_classes): - return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes - -def box_ciou(b1, b2): - """ - 输入为: - ---------- - b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh - b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh - - 返回为: - ------- - ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1) - """ - # 求出预测框左上角右下角 - b1_xy = b1[..., :2] - b1_wh = b1[..., 2:4] - b1_wh_half = b1_wh/2. - b1_mins = b1_xy - b1_wh_half - b1_maxes = b1_xy + b1_wh_half - # 求出真实框左上角右下角 - b2_xy = b2[..., :2] - b2_wh = b2[..., 2:4] - b2_wh_half = b2_wh/2. - b2_mins = b2_xy - b2_wh_half - b2_maxes = b2_xy + b2_wh_half - - # 求真实框和预测框所有的iou - intersect_mins = torch.max(b1_mins, b2_mins) - intersect_maxes = torch.min(b1_maxes, b2_maxes) - intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes)) - intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] - b1_area = b1_wh[..., 0] * b1_wh[..., 1] - b2_area = b2_wh[..., 0] * b2_wh[..., 1] - union_area = b1_area + b2_area - intersect_area - iou = intersect_area / torch.clamp(union_area,min = 1e-6) +import math +import numpy as np - # 计算中心的差距 - center_distance = torch.sum(torch.pow((b1_xy - b2_xy), 2), axis=-1) - - # 找到包裹两个框的最小框的左上角和右下角 - enclose_mins = torch.min(b1_mins, b2_mins) - enclose_maxes = torch.max(b1_maxes, b2_maxes) - enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes)) - # 计算对角线距离 - enclose_diagonal = torch.sum(torch.pow(enclose_wh,2), axis=-1) - ciou = iou - 1.0 * (center_distance) / torch.clamp(enclose_diagonal,min = 1e-6) - - v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0]/torch.clamp(b1_wh[..., 1],min = 1e-6)) - torch.atan(b2_wh[..., 0]/torch.clamp(b2_wh[..., 1],min = 1e-6))), 2) - alpha = v / torch.clamp((1.0 - iou + v),min=1e-6) - ciou = ciou - alpha * v - return ciou - -def clip_by_tensor(t,t_min,t_max): - t=t.float() - result = (t >= t_min).float() * t + (t < t_min).float() * t_min - result = (result <= t_max).float() * result + (result > t_max).float() * t_max - return result +class YOLOLoss(nn.Module): + def __init__(self, anchors, num_classes, input_shape, cuda, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]], label_smoothing = 0): + super(YOLOLoss, self).__init__() + #-----------------------------------------------------------# + # 13x13的特征层对应的anchor是[142, 110],[192, 243],[459, 401] + # 26x26的特征层对应的anchor是[36, 75],[76, 55],[72, 146] + # 52x52的特征层对应的anchor是[12, 16],[19, 36],[40, 28] + #-----------------------------------------------------------# + self.anchors = anchors + self.num_classes = num_classes + self.bbox_attrs = 5 + num_classes + self.input_shape = input_shape + self.anchors_mask = anchors_mask + self.label_smoothing = label_smoothing + + self.ignore_threshold = 0.7 + self.cuda = cuda -def MSELoss(pred,target): - return (pred-target)**2 + def clip_by_tensor(self, t, t_min, t_max): + t = t.float() + result = (t >= t_min).float() * t + (t < t_min).float() * t_min + result = (result <= t_max).float() * result + (result > t_max).float() * t_max + return 
result -def BCELoss(pred,target): - epsilon = 1e-7 - pred = clip_by_tensor(pred, epsilon, 1.0 - epsilon) - output = -target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred) - return output + def MSELoss(self, pred, target): + return torch.pow(pred - target, 2) -class YOLOLoss(nn.Module): - def __init__(self, anchors, num_classes, img_size, label_smooth=0, cuda=True, normalize=True): - super(YOLOLoss, self).__init__() - self.anchors = anchors - self.num_anchors = len(anchors) - self.num_classes = num_classes - self.bbox_attrs = 5 + num_classes - self.img_size = img_size - self.feature_length = [img_size[0]//32,img_size[0]//16,img_size[0]//8] - self.label_smooth = label_smooth + def BCELoss(self, pred, target): + epsilon = 1e-7 + pred = self.clip_by_tensor(pred, epsilon, 1.0 - epsilon) + output = - target * torch.log(pred) - (1.0 - target) * torch.log(1.0 - pred) + return output + + def box_ciou(self, b1, b2): + """ + 输入为: + ---------- + b1: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh + b2: tensor, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh + + 返回为: + ------- + ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1) + """ + #----------------------------------------------------# + # 求出预测框左上角右下角 + #----------------------------------------------------# + b1_xy = b1[..., :2] + b1_wh = b1[..., 2:4] + b1_wh_half = b1_wh/2. + b1_mins = b1_xy - b1_wh_half + b1_maxes = b1_xy + b1_wh_half + #----------------------------------------------------# + # 求出真实框左上角右下角 + #----------------------------------------------------# + b2_xy = b2[..., :2] + b2_wh = b2[..., 2:4] + b2_wh_half = b2_wh/2. + b2_mins = b2_xy - b2_wh_half + b2_maxes = b2_xy + b2_wh_half - self.ignore_threshold = 0.5 - self.lambda_conf = 1.0 - self.lambda_cls = 1.0 - self.lambda_loc = 1.0 - self.cuda = cuda - self.normalize = normalize + #----------------------------------------------------# + # 求真实框和预测框所有的iou + #----------------------------------------------------# + intersect_mins = torch.max(b1_mins, b2_mins) + intersect_maxes = torch.min(b1_maxes, b2_maxes) + intersect_wh = torch.max(intersect_maxes - intersect_mins, torch.zeros_like(intersect_maxes)) + intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] + b1_area = b1_wh[..., 0] * b1_wh[..., 1] + b2_area = b2_wh[..., 0] * b2_wh[..., 1] + union_area = b1_area + b2_area - intersect_area + iou = intersect_area / torch.clamp(union_area,min = 1e-6) - def forward(self, input, targets=None): #----------------------------------------------------# + # 计算中心的差距 + #----------------------------------------------------# + center_distance = torch.sum(torch.pow((b1_xy - b2_xy), 2), axis=-1) + + #----------------------------------------------------# + # 找到包裹两个框的最小框的左上角和右下角 + #----------------------------------------------------# + enclose_mins = torch.min(b1_mins, b2_mins) + enclose_maxes = torch.max(b1_maxes, b2_maxes) + enclose_wh = torch.max(enclose_maxes - enclose_mins, torch.zeros_like(intersect_maxes)) + #----------------------------------------------------# + # 计算对角线距离 + #----------------------------------------------------# + enclose_diagonal = torch.sum(torch.pow(enclose_wh,2), axis=-1) + ciou = iou - 1.0 * (center_distance) / torch.clamp(enclose_diagonal,min = 1e-6) + + v = (4 / (math.pi ** 2)) * torch.pow((torch.atan(b1_wh[..., 0] / torch.clamp(b1_wh[..., 1],min = 1e-6)) - torch.atan(b2_wh[..., 0] / torch.clamp(b2_wh[..., 1], min = 1e-6))), 2) + alpha = v / torch.clamp((1.0 - iou + v), min=1e-6) + ciou = ciou - alpha * v + return ciou + + 
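    # Recap of box_ciou above: CIoU = IoU - d^2 / c^2 - alpha * v, where d is the distance
+    # between the two box centers, c is the diagonal of the smallest box enclosing both,
+    # and v is the aspect-ratio penalty; each denominator is clamped to 1e-6 so the loss
+    # stays finite for degenerate boxes.
+
+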
#---------------------------------------------------# + # 平滑标签 + #---------------------------------------------------# + def smooth_labels(self, y_true, label_smoothing, num_classes): + return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes + + def forward(self, l, input, targets=None): + #----------------------------------------------------# + # l 代表使用的是第几个有效特征层 # input的shape为 bs, 3*(5+num_classes), 13, 13 # bs, 3*(5+num_classes), 26, 26 # bs, 3*(5+num_classes), 52, 52 + # targets 真实框的标签情况 [batch_size, num_gt, 5] #----------------------------------------------------# - - #-----------------------# - # 一共多少张图片 - #-----------------------# - bs = input.size(0) - #-----------------------# - # 特征层的高 - #-----------------------# - in_h = input.size(2) - #-----------------------# - # 特征层的宽 - #-----------------------# - in_w = input.size(3) - + #--------------------------------# + # 获得图片数量,特征层的高和宽 + #--------------------------------# + bs = input.size(0) + in_h = input.size(2) + in_w = input.size(3) #-----------------------------------------------------------------------# # 计算步长 # 每一个特征点对应原来的图片上多少个像素点 + # # 如果特征层为13x13的话,一个特征点就对应原来的图片上的32个像素点 # 如果特征层为26x26的话,一个特征点就对应原来的图片上的16个像素点 # 如果特征层为52x52的话,一个特征点就对应原来的图片上的8个像素点 # stride_h = stride_w = 32、16、8 #-----------------------------------------------------------------------# - stride_h = self.img_size[1] / in_h - stride_w = self.img_size[0] / in_w - - + stride_h = self.input_shape[0] / in_h + stride_w = self.input_shape[1] / in_w #-------------------------------------------------# # 此时获得的scaled_anchors大小是相对于特征层的 #-------------------------------------------------# - scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors] - + scaled_anchors = [(a_w / stride_w, a_h / stride_h) for a_w, a_h in self.anchors] #-----------------------------------------------# # 输入的input一共有三个,他们的shape分别是 + # bs, 3 * (5+num_classes), 13, 13 => bs, 3, 5 + num_classes, 13, 13 => batch_size, 3, 13, 13, 5 + num_classes + # batch_size, 3, 13, 13, 5 + num_classes # batch_size, 3, 26, 26, 5 + num_classes # batch_size, 3, 52, 52, 5 + num_classes #-----------------------------------------------# - prediction = input.view(bs, int(self.num_anchors/3), - self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous() + prediction = input.view(bs, len(self.anchors_mask[l]), self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous() - # 获得置信度,是否有物体 + #-----------------------------------------------# + # 先验框的中心位置的调整参数 + #-----------------------------------------------# + x = torch.sigmoid(prediction[..., 0]) + y = torch.sigmoid(prediction[..., 1]) + #-----------------------------------------------# + # 先验框的宽高调整参数 + #-----------------------------------------------# + w = prediction[..., 2] + h = prediction[..., 3] + #-----------------------------------------------# + # 获得置信度,是否有物体 + #-----------------------------------------------# conf = torch.sigmoid(prediction[..., 4]) - # 种类置信度 + #-----------------------------------------------# + # 种类置信度 + #-----------------------------------------------# pred_cls = torch.sigmoid(prediction[..., 5:]) - #---------------------------------------------------------------# - # 找到哪些先验框内部包含物体 - # 利用真实框和先验框计算交并比 - # mask batch_size, 3, in_h, in_w 有目标的特征点 - # noobj_mask batch_size, 3, in_h, in_w 无目标的特征点 - # t_box batch_size, 3, in_h, in_w, 4 中心宽高的真实值 - # tconf batch_size, 3, in_h, in_w 置信度真实值 - # tcls batch_size, 3, in_h, in_w, num_classes 种类真实值 - #----------------------------------------------------------------# - 
mask, noobj_mask, t_box, tconf, tcls, box_loss_scale_x, box_loss_scale_y = self.get_target(targets, scaled_anchors,in_w, in_h,self.ignore_threshold) + #-----------------------------------------------# + # 获得网络应该有的预测结果 + #-----------------------------------------------# + y_true, noobj_mask, box_loss_scale = self.get_target(l, targets, scaled_anchors, in_h, in_w) #---------------------------------------------------------------# # 将预测结果进行解码,判断预测结果和真实值的重合程度 # 如果重合程度过大则忽略,因为这些特征点属于预测比较准确的特征点 # 作为负样本不合适 #----------------------------------------------------------------# - noobj_mask, pred_boxes_for_ciou = self.get_ignore(prediction, targets, scaled_anchors, in_w, in_h, noobj_mask) + noobj_mask, pred_boxes = self.get_ignore(l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask) if self.cuda: - mask, noobj_mask = mask.cuda(), noobj_mask.cuda() - box_loss_scale_x, box_loss_scale_y= box_loss_scale_x.cuda(), box_loss_scale_y.cuda() - tconf, tcls = tconf.cuda(), tcls.cuda() - pred_boxes_for_ciou = pred_boxes_for_ciou.cuda() - t_box = t_box.cuda() + y_true = y_true.cuda() + noobj_mask = noobj_mask.cuda() + box_loss_scale = box_loss_scale.cuda() + #-----------------------------------------------------------# + # reshape_y_true[...,2:3]和reshape_y_true[...,3:4] + # 表示真实框的宽高,二者均在0-1之间 + # 真实框越大,比重越小,小框的比重更大。 + #-----------------------------------------------------------# + box_loss_scale = 2 - box_loss_scale - box_loss_scale = 2 - box_loss_scale_x * box_loss_scale_y #---------------------------------------------------------------# # 计算预测结果和真实结果的CIOU #----------------------------------------------------------------# - ciou = (1 - box_ciou( pred_boxes_for_ciou[mask.bool()], t_box[mask.bool()]))* box_loss_scale[mask.bool()] - loss_loc = torch.sum(ciou) - - # 计算置信度的loss - loss_conf = torch.sum(BCELoss(conf, mask) * mask) + \ - torch.sum(BCELoss(conf, mask) * noobj_mask) - - loss_cls = torch.sum(BCELoss(pred_cls[mask == 1], smooth_labels(tcls[mask == 1],self.label_smooth,self.num_classes))) - - loss = loss_conf * self.lambda_conf + loss_cls * self.lambda_cls + loss_loc * self.lambda_loc - - if self.normalize: - num_pos = torch.sum(mask) - num_pos = torch.max(num_pos, torch.ones_like(num_pos)) - else: - num_pos = bs/3 - + ciou = (1 - self.box_ciou(pred_boxes[y_true[..., 4] == 1], y_true[..., :4][y_true[..., 4] == 1])) * box_loss_scale[y_true[..., 4] == 1] + loss_loc = torch.sum(ciou) + #-----------------------------------------------------------# + # 计算置信度的loss + #-----------------------------------------------------------# + loss_conf = torch.sum(self.BCELoss(conf, y_true[..., 4]) * y_true[..., 4]) + \ + torch.sum(self.BCELoss(conf, y_true[..., 4]) * noobj_mask) + + loss_cls = torch.sum(self.BCELoss(pred_cls[y_true[..., 4] == 1], self.smooth_labels(y_true[..., 5:][y_true[..., 4] == 1], self.label_smoothing, self.num_classes))) + + loss = loss_loc + loss_conf + loss_cls + num_pos = torch.sum(y_true[..., 4]) + num_pos = torch.max(num_pos, torch.ones_like(num_pos)) return loss, num_pos - def get_target(self, target, anchors, in_w, in_h, ignore_threshold): + def calculate_iou(self, _box_a, _box_b): + #-----------------------------------------------------------# + # 计算真实框的左上角和右下角 + #-----------------------------------------------------------# + b1_x1, b1_x2 = _box_a[:, 0] - _box_a[:, 2] / 2, _box_a[:, 0] + _box_a[:, 2] / 2 + b1_y1, b1_y2 = _box_a[:, 1] - _box_a[:, 3] / 2, _box_a[:, 1] + _box_a[:, 3] / 2 + #-----------------------------------------------------------# + # 计算先验框获得的预测框的左上角和右下角 + 
#-----------------------------------------------------------# + b2_x1, b2_x2 = _box_b[:, 0] - _box_b[:, 2] / 2, _box_b[:, 0] + _box_b[:, 2] / 2 + b2_y1, b2_y2 = _box_b[:, 1] - _box_b[:, 3] / 2, _box_b[:, 1] + _box_b[:, 3] / 2 + + #-----------------------------------------------------------# + # 将真实框和预测框都转化成左上角右下角的形式 + #-----------------------------------------------------------# + box_a = torch.zeros_like(_box_a) + box_b = torch.zeros_like(_box_b) + box_a[:, 0], box_a[:, 1], box_a[:, 2], box_a[:, 3] = b1_x1, b1_y1, b1_x2, b1_y2 + box_b[:, 0], box_b[:, 1], box_b[:, 2], box_b[:, 3] = b2_x1, b2_y1, b2_x2, b2_y2 + + #-----------------------------------------------------------# + # A为真实框的数量,B为先验框的数量 + #-----------------------------------------------------------# + A = box_a.size(0) + B = box_b.size(0) + + #-----------------------------------------------------------# + # 计算交的面积 + #-----------------------------------------------------------# + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + inter = inter[:, :, 0] * inter[:, :, 1] + #-----------------------------------------------------------# + # 计算预测框和真实框各自的面积 + #-----------------------------------------------------------# + area_a = ((box_a[:, 2]-box_a[:, 0]) * (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + #-----------------------------------------------------------# + # 求IOU + #-----------------------------------------------------------# + union = area_a + area_b - inter + return inter / union # [A,B] + + def get_target(self, l, targets, anchors, in_h, in_w): #-----------------------------------------------------# # 计算一共有多少张图片 #-----------------------------------------------------# - bs = len(target) - #-------------------------------------------------------# - # 获得当前特征层先验框所属的编号,方便后面对先验框筛选 - #-------------------------------------------------------# - anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)] - subtract_index = [0,3,6][self.feature_length.index(in_w)] - #-------------------------------------------------------# - # 创建全是0或者全是1的阵列 - #-------------------------------------------------------# - mask = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - noobj_mask = torch.ones(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - - tx = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - ty = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - tw = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - th = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - t_box = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, 4, requires_grad=False) - tconf = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - tcls = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, self.num_classes, requires_grad=False) - - box_loss_scale_x = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - box_loss_scale_y = torch.zeros(bs, int(self.num_anchors/3), in_h, in_w, requires_grad=False) - for b in range(bs): - if len(target[b])==0: + bs = len(targets) + #-----------------------------------------------------# + # 用于选取哪些先验框不包含物体 + 
#-----------------------------------------------------# + noobj_mask = torch.ones(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False) + #-----------------------------------------------------# + # 让网络更加去关注小目标 + #-----------------------------------------------------# + box_loss_scale = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, requires_grad = False) + #-----------------------------------------------------# + # batch_size, 3, 13, 13, 5 + num_classes + #-----------------------------------------------------# + y_true = torch.zeros(bs, len(self.anchors_mask[l]), in_h, in_w, self.bbox_attrs, requires_grad = False) + for b in range(bs): + if len(targets[b])==0: continue + batch_target = torch.zeros_like(targets[b]) #-------------------------------------------------------# # 计算出正样本在特征层上的中心点 #-------------------------------------------------------# - gxs = target[b][:, 0:1] * in_w - gys = target[b][:, 1:2] * in_h - - #-------------------------------------------------------# - # 计算出正样本相对于特征层的宽高 - #-------------------------------------------------------# - gws = target[b][:, 2:3] * in_w - ghs = target[b][:, 3:4] * in_h - - #-------------------------------------------------------# - # 计算出正样本属于特征层的哪个特征点 - #-------------------------------------------------------# - gis = torch.floor(gxs) - gjs = torch.floor(gys) + batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w + batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h + batch_target[:, 4] = targets[b][:, 4] + batch_target = batch_target.cpu() #-------------------------------------------------------# # 将真实框转换一个形式 # num_true_box, 4 #-------------------------------------------------------# - gt_box = torch.FloatTensor(torch.cat([torch.zeros_like(gws), torch.zeros_like(ghs), gws, ghs], 1)) - + gt_box = torch.FloatTensor(torch.cat((torch.zeros((batch_target.size(0), 2)), batch_target[:, 2:4]), 1)) #-------------------------------------------------------# # 将先验框转换一个形式 # 9, 4 #-------------------------------------------------------# - anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((self.num_anchors, 2)), torch.FloatTensor(anchors)), 1)) + anchor_shapes = torch.FloatTensor(torch.cat((torch.zeros((len(anchors), 2)), torch.FloatTensor(anchors)), 1)) #-------------------------------------------------------# # 计算交并比 - # num_true_box, 9 + # self.calculate_iou(gt_box, anchor_shapes) = [num_true_box, 9]每一个真实框和9个先验框的重合情况 + # best_ns: + # [每个真实框最大的重合度max_iou, 每一个真实框最重合的先验框的序号] #-------------------------------------------------------# - anch_ious = jaccard(gt_box, anchor_shapes) + best_ns = torch.argmax(self.calculate_iou(gt_box, anchor_shapes), dim=-1) - #-------------------------------------------------------# - # 计算重合度最大的先验框是哪个 - # num_true_box, - #-------------------------------------------------------# - best_ns = torch.argmax(anch_ious,dim=-1) - for i, best_n in enumerate(best_ns): - if best_n not in anchor_index: + for t, best_n in enumerate(best_ns): + if best_n not in self.anchors_mask[l]: continue - #-------------------------------------------------------------# - # 取出各类坐标: - # gi和gj代表的是真实框对应的特征点的x轴y轴坐标 - # gx和gy代表真实框的x轴和y轴坐标 - # gw和gh代表真实框的宽和高 - #-------------------------------------------------------------# - gi = gis[i].long() - gj = gjs[i].long() - gx = gxs[i] - gy = gys[i] - gw = gws[i] - gh = ghs[i] - if (gj < in_h) and (gi < in_w): - best_n = best_n - subtract_index - #----------------------------------------# - # noobj_mask代表无目标的特征点 - #----------------------------------------# - noobj_mask[b, best_n, gj, gi] = 0 - 
#----------------------------------------# - # mask代表有目标的特征点 - #----------------------------------------# - mask[b, best_n, gj, gi] = 1 - #----------------------------------------# - # tx、ty代表中心的真实值 - #----------------------------------------# - tx[b, best_n, gj, gi] = gx - ty[b, best_n, gj, gi] = gy - #----------------------------------------# - # tw、th代表宽高的真实值 - #----------------------------------------# - tw[b, best_n, gj, gi] = gw - th[b, best_n, gj, gi] = gh - #----------------------------------------# - # 用于获得xywh的比例 - # 大目标loss权重小,小目标loss权重大 - #----------------------------------------# - box_loss_scale_x[b, best_n, gj, gi] = target[b][i, 2] - box_loss_scale_y[b, best_n, gj, gi] = target[b][i, 3] - #----------------------------------------# - # tconf代表物体置信度 - #----------------------------------------# - tconf[b, best_n, gj, gi] = 1 - #----------------------------------------# - # tcls代表种类置信度 - #----------------------------------------# - tcls[b, best_n, gj, gi, target[b][i, 4].long()] = 1 - else: - print('Step {0} out of bound'.format(b)) - print('gj: {0}, height: {1} | gi: {2}, width: {3}'.format(gj, in_h, gi, in_w)) - continue - t_box[...,0] = tx - t_box[...,1] = ty - t_box[...,2] = tw - t_box[...,3] = th - return mask, noobj_mask, t_box, tconf, tcls, box_loss_scale_x, box_loss_scale_y - - - def get_ignore(self,prediction,target,scaled_anchors,in_w, in_h,noobj_mask): + #----------------------------------------# + # 判断这个先验框是当前特征点的哪一个先验框 + #----------------------------------------# + k = self.anchors_mask[l].index(best_n) + #----------------------------------------# + # 获得真实框属于哪个网格点 + #----------------------------------------# + i = torch.floor(batch_target[t, 0]).long() + j = torch.floor(batch_target[t, 1]).long() + #----------------------------------------# + # 取出真实框的种类 + #----------------------------------------# + c = batch_target[t, 4].long() + + #----------------------------------------# + # noobj_mask代表无目标的特征点 + #----------------------------------------# + noobj_mask[b, k, j, i] = 0 + #----------------------------------------# + # tx、ty代表中心调整参数的真实值 + #----------------------------------------# + y_true[b, k, j, i, 0] = batch_target[t, 0] + y_true[b, k, j, i, 1] = batch_target[t, 1] + y_true[b, k, j, i, 2] = batch_target[t, 2] + y_true[b, k, j, i, 3] = batch_target[t, 3] + y_true[b, k, j, i, 4] = 1 + y_true[b, k, j, i, c + 5] = 1 + #----------------------------------------# + # 用于获得xywh的比例 + # 大目标loss权重小,小目标loss权重大 + #----------------------------------------# + box_loss_scale[b, k, j, i] = batch_target[t, 2] * batch_target[t, 3] / in_w / in_h + return y_true, noobj_mask, box_loss_scale + + def get_ignore(self, l, x, y, h, w, targets, scaled_anchors, in_h, in_w, noobj_mask): #-----------------------------------------------------# # 计算一共有多少张图片 #-----------------------------------------------------# - bs = len(target) - #-------------------------------------------------------# - # 获得当前特征层先验框所属的编号,方便后面对先验框筛选 - #-------------------------------------------------------# - anchor_index = [[0,1,2],[3,4,5],[6,7,8]][self.feature_length.index(in_w)] - scaled_anchors = np.array(scaled_anchors)[anchor_index] - - # 先验框的中心位置的调整参数 - x = torch.sigmoid(prediction[..., 0]) - y = torch.sigmoid(prediction[..., 1]) - # 先验框的宽高调整参数 - w = prediction[..., 2] # Width - h = prediction[..., 3] # Height + bs = len(targets) FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor - LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor - - # 生成网格,先验框中心,网格左上角 + LongTensor = 
torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + #-----------------------------------------------------# + # 生成网格,先验框中心,网格左上角 + #-----------------------------------------------------# grid_x = torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat( - int(bs*self.num_anchors/3), 1, 1).view(x.shape).type(FloatTensor) + int(bs * len(self.anchors_mask[l])), 1, 1).view(x.shape).type(FloatTensor) grid_y = torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat( - int(bs*self.num_anchors/3), 1, 1).view(y.shape).type(FloatTensor) + int(bs * len(self.anchors_mask[l])), 1, 1).view(y.shape).type(FloatTensor) # 生成先验框的宽高 - anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) - anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) + scaled_anchors_l = np.array(scaled_anchors)[self.anchors_mask[l]] + anchor_w = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([0])) + anchor_h = FloatTensor(scaled_anchors_l).index_select(1, LongTensor([1])) anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape) anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape) - #-------------------------------------------------------# # 计算调整后的先验框中心与宽高 #-------------------------------------------------------# - pred_boxes = FloatTensor(prediction[..., :4].shape) - pred_boxes[..., 0] = x + grid_x - pred_boxes[..., 1] = y + grid_y - pred_boxes[..., 2] = torch.exp(w) * anchor_w - pred_boxes[..., 3] = torch.exp(h) * anchor_h - for i in range(bs): - pred_boxes_for_ignore = pred_boxes[i] + pred_boxes_x = torch.unsqueeze(x + grid_x, -1) + pred_boxes_y = torch.unsqueeze(y + grid_y, -1) + pred_boxes_w = torch.unsqueeze(torch.exp(w) * anchor_w, -1) + pred_boxes_h = torch.unsqueeze(torch.exp(h) * anchor_h, -1) + pred_boxes = torch.cat([pred_boxes_x, pred_boxes_y, pred_boxes_w, pred_boxes_h], dim = -1) + for b in range(bs): #-------------------------------------------------------# # 将预测结果转换一个形式 # pred_boxes_for_ignore num_anchors, 4 #-------------------------------------------------------# - pred_boxes_for_ignore = pred_boxes_for_ignore.view(-1, 4) + pred_boxes_for_ignore = pred_boxes[b].view(-1, 4) #-------------------------------------------------------# # 计算真实框,并把真实框转换成相对于特征层的大小 # gt_box num_true_box, 4 #-------------------------------------------------------# - if len(target[i]) > 0: - gx = target[i][:, 0:1] * in_w - gy = target[i][:, 1:2] * in_h - gw = target[i][:, 2:3] * in_w - gh = target[i][:, 3:4] * in_h - gt_box = torch.FloatTensor(torch.cat([gx, gy, gw, gh],-1)).type(FloatTensor) - + if len(targets[b]) > 0: + batch_target = torch.zeros_like(targets[b]) + #-------------------------------------------------------# + # 计算出正样本在特征层上的中心点 + #-------------------------------------------------------# + batch_target[:, [0,2]] = targets[b][:, [0,2]] * in_w + batch_target[:, [1,3]] = targets[b][:, [1,3]] * in_h + batch_target = batch_target[:, :4] #-------------------------------------------------------# # 计算交并比 # anch_ious num_true_box, num_anchors #-------------------------------------------------------# - anch_ious = jaccard(gt_box, pred_boxes_for_ignore) + anch_ious = self.calculate_iou(batch_target, pred_boxes_for_ignore) #-------------------------------------------------------# # 每个先验框对应真实框的最大重合度 # anch_ious_max num_anchors #-------------------------------------------------------# - anch_ious_max, _ = torch.max(anch_ious,dim=0) - anch_ious_max = anch_ious_max.view(pred_boxes[i].size()[:3]) - noobj_mask[i][anch_ious_max>self.ignore_threshold] = 
0 + anch_ious_max, _ = torch.max(anch_ious, dim = 0) + anch_ious_max = anch_ious_max.view(pred_boxes[b].size()[:3]) + noobj_mask[b][anch_ious_max > self.ignore_threshold] = 0 return noobj_mask, pred_boxes -def weights_init(net, init_type='normal', init_gain=0.02): +def weights_init(net, init_type='normal', init_gain = 0.02): def init_func(m): classname = m.__class__.__name__ if hasattr(m, 'weight') and classname.find('Conv') != -1: @@ -444,51 +413,3 @@ def weights_init(net, init_type='normal', init_gain=0.02): torch.nn.init.constant_(m.bias.data, 0.0) print('initialize network with %s type' % init_type) net.apply(init_func) - -class LossHistory(): - def __init__(self, log_dir): - import datetime - curr_time = datetime.datetime.now() - time_str = datetime.datetime.strftime(curr_time,'%Y_%m_%d_%H_%M_%S') - self.log_dir = log_dir - self.time_str = time_str - self.save_path = os.path.join(self.log_dir, "loss_" + str(self.time_str)) - self.losses = [] - self.val_loss = [] - - os.makedirs(self.save_path) - - def append_loss(self, loss, val_loss): - self.losses.append(loss) - self.val_loss.append(val_loss) - with open(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".txt"), 'a') as f: - f.write(str(loss)) - f.write("\n") - with open(os.path.join(self.save_path, "epoch_val_loss_" + str(self.time_str) + ".txt"), 'a') as f: - f.write(str(val_loss)) - f.write("\n") - self.loss_plot() - - def loss_plot(self): - iters = range(len(self.losses)) - - plt.figure() - plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss') - plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss') - try: - if len(self.losses) < 25: - num = 5 - else: - num = 15 - - plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss') - plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss') - except: - pass - - plt.grid(True) - plt.xlabel('Epoch') - plt.ylabel('Loss') - plt.legend(loc="upper right") - - plt.savefig(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".png")) diff --git a/predict.py b/predict.py index 65a8aa1c6651a9c8dd98a7baf3b2701cbc52ed95..15bd461186d0ba77c4c66fcb8fe212ebc5e354cf 100644 --- a/predict.py +++ b/predict.py @@ -1,8 +1,7 @@ -#----------------------------------------------------# -# 对视频中的predict.py进行了修改, -# 将单张图片预测、摄像头检测和FPS测试功能 +#-----------------------------------------------------------------------# +# predict.py将单张图片预测、摄像头检测、FPS测试和目录遍历检测等功能 # 整合到了一个py文件中,通过指定mode进行模式的修改。 -#----------------------------------------------------# +#-----------------------------------------------------------------------# import time import cv2 @@ -13,33 +12,44 @@ from yolo import YOLO if __name__ == "__main__": yolo = YOLO() - #-------------------------------------------------------------------------# + #----------------------------------------------------------------------------------------------------------# # mode用于指定测试的模式: - # 'predict'表示单张图片预测 - # 'video'表示视频检测 - # 'fps'表示测试fps - #-------------------------------------------------------------------------# + # 'predict'表示单张图片预测,如果想对预测过程进行修改,如保存图片,截取对象等,可以先看下方详细的注释 + # 'video'表示视频检测,可调用摄像头或者视频进行检测,详情查看下方注释。 + # 'fps'表示测试fps,使用的图片是img里面的street.jpg,详情查看下方注释。 + # 'dir_predict'表示遍历文件夹进行检测并保存。默认遍历img文件夹,保存img_out文件夹,详情查看下方注释。 + #----------------------------------------------------------------------------------------------------------# mode = "predict" 
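Of the four modes listed above, 'predict' is the one the numbered notes further down describe tweaking in prose. As a minimal sketch of note 1 (save the annotated result instead of showing it), reusing the `yolo = YOLO()` instance from the top of this file — the input path here is only illustrative:

    # Hedged sketch of a single 'predict' pass with saving applied;
    # 'img/street.jpg' is an assumed example path, not part of the patch.
    from PIL import Image

    image   = Image.open('img/street.jpg')
    r_image = yolo.detect_image(image)     # draw boxes on a copy of the image
    r_image.save('img_out.jpg')            # per note 1: save instead of r_image.show()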
#-------------------------------------------------------------------------# # video_path用于指定视频的路径,当video_path=0时表示检测摄像头 # video_save_path表示视频保存的路径,当video_save_path=""时表示不保存 # video_fps用于保存的视频的fps # video_path、video_save_path和video_fps仅在mode='video'时有效 - # 保存视频时需要ctrl+c退出才会完成完整的保存步骤,不可直接结束程序。 + # 保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。 #-------------------------------------------------------------------------# video_path = 0 video_save_path = "" video_fps = 25.0 + #-------------------------------------------------------------------------# + # test_interval用于指定测量fps的时候,图片检测的次数 + # 理论上test_interval越大,fps越准确。 + #-------------------------------------------------------------------------# + test_interval = 100 + #-------------------------------------------------------------------------# + # dir_origin_path指定了用于检测的图片的文件夹路径 + # dir_save_path指定了检测完图片的保存路径 + # dir_origin_path和dir_save_path仅在mode='dir_predict'时有效 + #-------------------------------------------------------------------------# + dir_origin_path = "img/" + dir_save_path = "img_out/" if mode == "predict": ''' - 1、该代码无法直接进行批量预测,如果想要批量预测,可以利用os.listdir()遍历文件夹,利用Image.open打开图片文件进行预测。 - 具体流程可以参考get_dr_txt.py,在get_dr_txt.py即实现了遍历还实现了目标信息的保存。 - 2、如果想要进行检测完的图片的保存,利用r_image.save("img.jpg")即可保存,直接在predict.py里进行修改即可。 - 3、如果想要获得预测框的坐标,可以进入yolo.detect_image函数,在绘图部分读取top,left,bottom,right这四个值。 - 4、如果想要利用预测框截取下目标,可以进入yolo.detect_image函数,在绘图部分利用获取到的top,left,bottom,right这四个值 + 1、如果想要进行检测完的图片的保存,利用r_image.save("img.jpg")即可保存,直接在predict.py里进行修改即可。 + 2、如果想要获得预测框的坐标,可以进入yolo.detect_image函数,在绘图部分读取top,left,bottom,right这四个值。 + 3、如果想要利用预测框截取下目标,可以进入yolo.detect_image函数,在绘图部分利用获取到的top,left,bottom,right这四个值 在原图上利用矩阵的方式进行截取。 - 5、如果想要在预测图上写额外的字,比如检测到的特定目标的数量,可以进入yolo.detect_image函数,在绘图部分对predicted_class进行判断, + 4、如果想要在预测图上写额外的字,比如检测到的特定目标的数量,可以进入yolo.detect_image函数,在绘图部分对predicted_class进行判断, 比如判断if predicted_class == 'car': 即可判断当前目标是否为车,然后记录数量即可。利用draw.text即可写字。 ''' while True: @@ -54,11 +64,11 @@ if __name__ == "__main__": r_image.show() elif mode == "video": - capture=cv2.VideoCapture(video_path) + capture = cv2.VideoCapture(video_path) if video_save_path!="": - fourcc = cv2.VideoWriter_fourcc(*'XVID') - size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))) - out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size) + fourcc = cv2.VideoWriter_fourcc(*'XVID') + size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))) + out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size) fps = 0.0 while(True): @@ -91,9 +101,23 @@ if __name__ == "__main__": cv2.destroyAllWindows() elif mode == "fps": - test_interval = 100 img = Image.open('img/street.jpg') tact_time = yolo.get_FPS(img, test_interval) print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1') + + elif mode == "dir_predict": + import os + from tqdm import tqdm + + img_names = os.listdir(dir_origin_path) + for img_name in tqdm(img_names): + if img_name.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')): + image_path = os.path.join(dir_origin_path, img_name) + image = Image.open(image_path) + r_image = yolo.detect_image(image) + if not os.path.exists(dir_save_path): + os.makedirs(dir_save_path) + r_image.save(os.path.join(dir_save_path, img_name)) + else: - raise AssertionError("Please specify the correct mode: 'predict', 'video' or 'fps'.") + raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps' or 'dir_predict'.") diff --git 
a/summary.py b/summary.py new file mode 100644 index 0000000000000000000000000000000000000000..0a36d41ea6d12b36d403e87d371fb0132c7dde4e --- /dev/null +++ b/summary.py @@ -0,0 +1,13 @@ +#--------------------------------------------# +# 该部分代码用于看网络结构 +#--------------------------------------------# +import torch +from torchsummary import summary + +from nets.yolo import YoloBody + +if __name__ == "__main__": + # 需要使用device来指定网络在GPU还是CPU运行 + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + m = YoloBody([[6, 7, 8], [3, 4, 5], [0, 1, 2]], 80).to(device) + summary(m, input_size=(3, 416, 416)) diff --git a/train.py b/train.py index 56a70e7e4b58ac5141a9108edfe8340f47e26669..3563d416cbf6c8c90bd73fa09aa436a5ddc26bb2 100644 --- a/train.py +++ b/train.py @@ -6,326 +6,209 @@ import torch import torch.backends.cudnn as cudnn import torch.optim as optim from torch.utils.data import DataLoader -from tqdm import tqdm -from nets.yolo4 import YoloBody -from nets.yolo_training import LossHistory, YOLOLoss, weights_init +from nets.yolo import YoloBody +from nets.yolo_training import YOLOLoss, weights_init +from utils.callbacks import LossHistory from utils.dataloader import YoloDataset, yolo_dataset_collate +from utils.utils import get_anchors, get_classes +from utils.utils_fit import fit_one_epoch - -#---------------------------------------------------# -# 获得类和先验框 -#---------------------------------------------------# -def get_classes(classes_path): - '''loads the classes''' - with open(classes_path) as f: - class_names = f.readlines() - class_names = [c.strip() for c in class_names] - return class_names - -def get_anchors(anchors_path): - '''loads the anchors from a file''' - with open(anchors_path) as f: - anchors = f.readline() - anchors = [float(x) for x in anchors.split(',')] - return np.array(anchors).reshape([-1,3,2])[::-1,:,:] - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def fit_one_epoch(net,yolo_loss,epoch,epoch_size,epoch_size_val,gen,genval,Epoch,cuda): - if Tensorboard: - global train_tensorboard_step, val_tensorboard_step - total_loss = 0 - val_loss = 0 - - net.train() - print('Start Train') - with tqdm(total=epoch_size,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar: - for iteration, batch in enumerate(gen): - if iteration >= epoch_size: - break - images, targets = batch[0], batch[1] - with torch.no_grad(): - if cuda: - images = torch.from_numpy(images).type(torch.FloatTensor).cuda() - targets = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets] - else: - images = torch.from_numpy(images).type(torch.FloatTensor) - targets = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets] - #----------------------# - # 清零梯度 - #----------------------# - optimizer.zero_grad() - #----------------------# - # 前向传播 - #----------------------# - outputs = net(images) - losses = [] - num_pos_all = 0 - #----------------------# - # 计算损失 - #----------------------# - for i in range(3): - loss_item, num_pos = yolo_loss(outputs[i], targets) - losses.append(loss_item) - num_pos_all += num_pos - - loss = sum(losses) / num_pos_all - total_loss += loss.item() - - #----------------------# - # 反向传播 - #----------------------# - loss.backward() - optimizer.step() - - if Tensorboard: - # 将loss写入tensorboard,每一步都写 - writer.add_scalar('Train_loss', loss, train_tensorboard_step) - train_tensorboard_step += 1 - - pbar.set_postfix(**{'total_loss': total_loss / (iteration + 1), - 'lr' : get_lr(optimizer)}) 
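The training loop being removed here normalizes the summed losses of the three detection heads by the total positive-sample count (`loss = sum(losses) / num_pos_all`). A toy check of that aggregation — the numbers are made up, only the arithmetic is the point:

    import torch

    # Assumed per-head losses for the 13x13 / 26x26 / 52x52 outputs.
    losses      = [torch.tensor(12.0), torch.tensor(7.5), torch.tensor(4.5)]
    num_pos_all = torch.tensor(8.0)    # positive anchors matched across all heads
    # Dividing by the positive count keeps the loss scale independent of how
    # many ground-truth boxes a batch happens to contain.
    loss = sum(losses) / torch.clamp(num_pos_all, min=1.0)
    print(loss)                        # tensor(3.) -> loss per positive sample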
- pbar.update(1) - - # 将loss写入tensorboard,下面注释的是每个世代保存一次 - # if Tensorboard: - # writer.add_scalar('Train_loss', total_loss/(iteration+1), epoch) - net.eval() - print('Start Validation') - with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar: - for iteration, batch in enumerate(genval): - if iteration >= epoch_size_val: - break - images_val, targets_val = batch[0], batch[1] - - with torch.no_grad(): - if cuda: - images_val = torch.from_numpy(images_val).type(torch.FloatTensor).cuda() - targets_val = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets_val] - else: - images_val = torch.from_numpy(images_val).type(torch.FloatTensor) - targets_val = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets_val] - optimizer.zero_grad() - - outputs = net(images_val) - losses = [] - num_pos_all = 0 - for i in range(3): - loss_item, num_pos = yolo_loss(outputs[i], targets_val) - losses.append(loss_item) - num_pos_all += num_pos - loss = sum(losses) / num_pos_all - val_loss += loss.item() - - # 将loss写入tensorboard, 下面注释的是每一步都写 - # if Tensorboard: - # writer.add_scalar('Val_loss', loss, val_tensorboard_step) - # val_tensorboard_step += 1 - pbar.set_postfix(**{'total_loss': val_loss / (iteration + 1)}) - pbar.update(1) - - # 将loss写入tensorboard,每个世代保存一次 - if Tensorboard: - writer.add_scalar('Val_loss',val_loss / (epoch_size_val+1), epoch) - loss_history.append_loss(total_loss/(epoch_size+1), val_loss/(epoch_size_val+1)) - print('Finish Validation') - print('Epoch:'+ str(epoch+1) + '/' + str(Epoch)) - print('Total Loss: %.4f || Val Loss: %.4f ' % (total_loss/(epoch_size+1),val_loss/(epoch_size_val+1))) - print('Saving state, iter:', str(epoch+1)) - torch.save(model.state_dict(), 'logs/Epoch%d-Total_Loss%.4f-Val_Loss%.4f.pth'%((epoch+1),total_loss/(epoch_size+1),val_loss/(epoch_size_val+1))) - -#----------------------------------------------------# -# 检测精度mAP和pr曲线计算参考视频 -# https://www.bilibili.com/video/BV1zE411u7Vw -#----------------------------------------------------# if __name__ == "__main__": - #-------------------------------# - # 是否使用Tensorboard - #-------------------------------# - Tensorboard = False #-------------------------------# # 是否使用Cuda # 没有GPU可以设置成False #-------------------------------# Cuda = True + #--------------------------------------------------------# + # 训练前一定要修改classes_path,使其对应自己的数据集 + #--------------------------------------------------------# + classes_path = 'model_data/voc_classes.txt' + #---------------------------------------------------------------------# + # anchors_path代表先验框对应的txt文件,一般不修改。 + # anchors_mask用于帮助代码找到对应的先验框,一般不修改。 + #---------------------------------------------------------------------# + anchors_path = 'model_data/yolo_anchors.txt' + anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + #------------------------------------------------------------------------------------------------------# + # 权值文件请看README,百度网盘下载。数据的预训练权重对不同数据集是通用的,因为特征是通用的 + # 预训练权重对于99%的情况都必须要用,不用的话权值太过随机,特征提取效果不明显,网络训练的结果也不会好。 + # 训练自己的数据集时提示维度不匹配正常,预测的东西都不一样了自然维度不匹配 + # 如果想要断点续练就将model_path设置成logs文件夹下已经训练的权值文件。 + #------------------------------------------------------------------------------------------------------# + model_path = 'model_data/yolo4_weights.pth' #------------------------------------------------------# - # 是否对损失进行归一化,用于改变loss的大小 - # 用于决定计算最终loss是除上batch_size还是除上正样本数量 + # 输入的shape大小,一定要是32的倍数 #------------------------------------------------------# - normalize = False - 
#-------------------------------# - # 输入的shape大小 - # 显存比较小可以使用416x416 - # 显存比较大可以使用608x608 - #-------------------------------# - input_shape = (416,416) - #----------------------------------------------------# - # classes和anchor的路径,非常重要 - # 训练前一定要修改classes_path,使其对应自己的数据集 - #----------------------------------------------------# - anchors_path = 'model_data/yolo_anchors.txt' - classes_path = 'model_data/voc_classes.txt' + input_shape = [416, 416] #------------------------------------------------------# # Yolov4的tricks应用 # mosaic 马赛克数据增强 True or False # 实际测试时mosaic数据增强并不稳定,所以默认为False - # Cosine_scheduler 余弦退火学习率 True or False + # Cosine_lr 余弦退火学习率 True or False # label_smoothing 标签平滑 0.01以下一般 如0.01、0.005 #------------------------------------------------------# - mosaic = False - Cosine_lr = False - smoooth_label = 0 + mosaic = False + Cosine_lr = False + label_smoothing = 0 + + #----------------------------------------------------# + # 训练分为两个阶段,分别是冻结阶段和解冻阶段。 + # 显存不足与数据集大小无关,提示显存不足请调小batch_size。 + # 受到BatchNorm层影响,batch_size最小为2,不能为1。 + #----------------------------------------------------# + #----------------------------------------------------# + # 冻结阶段训练参数 + # 此时模型的主干被冻结了,特征提取网络不发生改变 + # 占用的显存较小,仅对网络进行微调 + #----------------------------------------------------# + Init_Epoch = 0 + Freeze_Epoch = 50 + Freeze_batch_size = 8 + Freeze_lr = 1e-3 + #----------------------------------------------------# + # 解冻阶段训练参数 + # 此时模型的主干不被冻结了,特征提取网络会发生改变 + # 占用的显存较大,网络所有的参数都会发生改变 + #----------------------------------------------------# + UnFreeze_Epoch = 100 + Unfreeze_batch_size = 4 + Unfreeze_lr = 1e-4 + #------------------------------------------------------# + # 是否进行冻结训练,默认先冻结主干训练后解冻训练。 + #------------------------------------------------------# + Freeze_Train = True + #------------------------------------------------------# + # 用于设置是否使用多线程读取数据 + # 开启后会加快数据读取速度,但是会占用更多内存 + # 内存较小的电脑可以设置为2或者0 + #------------------------------------------------------# + num_workers = 4 + #----------------------------------------------------# + # 获得图片路径和标签 + #----------------------------------------------------# + train_annotation_path = '2007_train.txt' + val_annotation_path = '2007_val.txt' #----------------------------------------------------# # 获取classes和anchor #----------------------------------------------------# - class_names = get_classes(classes_path) - anchors = get_anchors(anchors_path) - num_classes = len(class_names) + class_names, num_classes = get_classes(classes_path) + anchors, num_anchors = get_anchors(anchors_path) #------------------------------------------------------# # 创建yolo模型 - # 训练前一定要修改classes_path和对应的txt文件 #------------------------------------------------------# - model = YoloBody(len(anchors[0]), num_classes) + model = YoloBody(anchors_mask, num_classes) weights_init(model) - #------------------------------------------------------# # 权值文件请看README,百度网盘下载 #------------------------------------------------------# - model_path = "model_data/yolo4_weights.pth" - print('Loading weights into state dict...') - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - model_dict = model.state_dict() - pretrained_dict = torch.load(model_path, map_location=device) - pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)} + print('Load weights {}.'.format(model_path)) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model_dict = model.state_dict() + pretrained_dict = torch.load(model_path, map_location = device) + 
pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)} model_dict.update(pretrained_dict) model.load_state_dict(model_dict) - print('Finished!') - - net = model.train() + model_train = model.train() if Cuda: - net = torch.nn.DataParallel(model) + model_train = torch.nn.DataParallel(model) cudnn.benchmark = True - net = net.cuda() + model_train = model_train.cuda() - yolo_loss = YOLOLoss(np.reshape(anchors,[-1,2]), num_classes, (input_shape[1], input_shape[0]), smoooth_label, Cuda, normalize) + yolo_loss = YOLOLoss(anchors, num_classes, input_shape, Cuda, anchors_mask, label_smoothing) loss_history = LossHistory("logs/") - #----------------------------------------------------# - # 获得图片路径和标签 - #----------------------------------------------------# - annotation_path = '2007_train.txt' - #----------------------------------------------------------------------# - # 验证集的划分在train.py代码里面进行 - # 2007_test.txt和2007_val.txt里面没有内容是正常的。训练不会使用到。 - # 当前划分方式下,验证集和训练集的比例为1:9 - #----------------------------------------------------------------------# - val_split = 0.1 - with open(annotation_path) as f: - lines = f.readlines() - np.random.seed(10101) - np.random.shuffle(lines) - np.random.seed(None) - num_val = int(len(lines)*val_split) - num_train = len(lines) - num_val - - if Tensorboard: - from tensorboardX import SummaryWriter - writer = SummaryWriter(log_dir='logs',flush_secs=60) - if Cuda: - graph_inputs = torch.randn(1,3,input_shape[0],input_shape[1]).type(torch.FloatTensor).cuda() - else: - graph_inputs = torch.randn(1,3,input_shape[0],input_shape[1]).type(torch.FloatTensor) - writer.add_graph(model, graph_inputs) - train_tensorboard_step = 1 - val_tensorboard_step = 1 + #---------------------------# + # 读取数据集对应的txt + #---------------------------# + with open(train_annotation_path) as f: + train_lines = f.readlines() + with open(val_annotation_path) as f: + val_lines = f.readlines() + num_train = len(train_lines) + num_val = len(val_lines) #------------------------------------------------------# # 主干特征提取网络特征通用,冻结训练可以加快训练速度 # 也可以在训练初期防止权值被破坏。 # Init_Epoch为起始世代 # Freeze_Epoch为冻结训练的世代 - # Epoch总训练世代 + # UnFreeze_Epoch总训练世代 # 提示OOM或者显存不足请调小Batch_size #------------------------------------------------------# if True: - lr = 1e-3 - Batch_size = 4 - Init_Epoch = 0 - Freeze_Epoch = 50 + batch_size = Freeze_batch_size + lr = Freeze_lr + start_epoch = Init_Epoch + end_epoch = Freeze_Epoch - #----------------------------------------------------------------------------# - # 我在实际测试时,发现optimizer的weight_decay起到了反作用, - # 所以去除掉了weight_decay,大家也可以开起来试试,一般是weight_decay=5e-4 - #----------------------------------------------------------------------------# - optimizer = optim.Adam(net.parameters(),lr) + optimizer = optim.Adam(model_train.parameters(), lr, weight_decay = 5e-4) if Cosine_lr: lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5) else: lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.92) - train_dataset = YoloDataset(lines[:num_train], (input_shape[0], input_shape[1]), mosaic=mosaic, is_train=True) - val_dataset = YoloDataset(lines[num_train:], (input_shape[0], input_shape[1]), mosaic=False, is_train=False) - gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True, - drop_last=True, collate_fn=yolo_dataset_collate) - gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True, - drop_last=True, collate_fn=yolo_dataset_collate) 
- - epoch_size = num_train // Batch_size - epoch_size_val = num_val // Batch_size + train_dataset = YoloDataset(train_lines, input_shape, num_classes, mosaic=mosaic, train = True) + val_dataset = YoloDataset(val_lines, input_shape, num_classes, mosaic=False, train = False) + gen = DataLoader(train_dataset, shuffle = True, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate) + gen_val = DataLoader(val_dataset , shuffle = True, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate) + + epoch_step = num_train // batch_size + epoch_step_val = num_val // batch_size - if epoch_size == 0 or epoch_size_val == 0: + if epoch_step == 0 or epoch_step_val == 0: raise ValueError("数据集过小,无法进行训练,请扩充数据集。") + #------------------------------------# # 冻结一定部分训练 #------------------------------------# - for param in model.backbone.parameters(): - param.requires_grad = False + if Freeze_Train: + for param in model.backbone.parameters(): + param.requires_grad = False - for epoch in range(Init_Epoch,Freeze_Epoch): - fit_one_epoch(net,yolo_loss,epoch,epoch_size,epoch_size_val,gen,gen_val,Freeze_Epoch,Cuda) + for epoch in range(start_epoch, end_epoch): + fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch, + epoch_step, epoch_step_val, gen, gen_val, end_epoch, Cuda) lr_scheduler.step() - + if True: - lr = 1e-4 - Batch_size = 2 - Freeze_Epoch = 50 - Unfreeze_Epoch = 100 - - #----------------------------------------------------------------------------# - # 我在实际测试时,发现optimizer的weight_decay起到了反作用, - # 所以去除掉了weight_decay,大家也可以开起来试试,一般是weight_decay=5e-4 - #----------------------------------------------------------------------------# - optimizer = optim.Adam(net.parameters(),lr) + batch_size = Unfreeze_batch_size + lr = Unfreeze_lr + start_epoch = Freeze_Epoch + end_epoch = UnFreeze_Epoch + + optimizer = optim.Adam(model_train.parameters(), lr, weight_decay = 5e-4) if Cosine_lr: lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-5) else: lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.92) - train_dataset = YoloDataset(lines[:num_train], (input_shape[0], input_shape[1]), mosaic=mosaic, is_train=True) - val_dataset = YoloDataset(lines[num_train:], (input_shape[0], input_shape[1]), mosaic=False, is_train=False) - gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=True, - drop_last=True, collate_fn=yolo_dataset_collate) - gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4,pin_memory=True, - drop_last=True, collate_fn=yolo_dataset_collate) - - epoch_size = num_train // Batch_size - epoch_size_val = num_val // Batch_size + train_dataset = YoloDataset(train_lines, input_shape, num_classes, mosaic=mosaic, train = True) + val_dataset = YoloDataset(val_lines, input_shape, num_classes, mosaic=False, train = False) + gen = DataLoader(train_dataset, shuffle = True, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate) + gen_val = DataLoader(val_dataset , shuffle = True, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate) + + epoch_step = num_train // batch_size + epoch_step_val = num_val // batch_size - if epoch_size == 0 or epoch_size_val == 0: + if epoch_step == 0 or epoch_step_val == 0: raise 
ValueError("数据集过小,无法进行训练,请扩充数据集。") + #------------------------------------# - # 解冻后训练 + # 冻结一定部分训练 #------------------------------------# - for param in model.backbone.parameters(): - param.requires_grad = True - - for epoch in range(Freeze_Epoch,Unfreeze_Epoch): - fit_one_epoch(net,yolo_loss,epoch,epoch_size,epoch_size_val,gen,gen_val,Unfreeze_Epoch,Cuda) - lr_scheduler.step() + if Freeze_Train: + for param in model.backbone.parameters(): + param.requires_grad = False + + for epoch in range(start_epoch, end_epoch): + fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch, + epoch_step, epoch_step_val, gen, gen_val, end_epoch, Cuda) + lr_scheduler.step() \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4287ca8617970fa8fc025b75cb319c7032706910 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1 @@ +# \ No newline at end of file diff --git a/utils/callbacks.py b/utils/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..0103c42b01c60563d323e0020b5478240443b4d3 --- /dev/null +++ b/utils/callbacks.py @@ -0,0 +1,56 @@ +import os + +import scipy.signal +from matplotlib import pyplot as plt + + +class LossHistory(): + def __init__(self, log_dir): + import datetime + curr_time = datetime.datetime.now() + time_str = datetime.datetime.strftime(curr_time,'%Y_%m_%d_%H_%M_%S') + self.log_dir = log_dir + self.time_str = time_str + self.save_path = os.path.join(self.log_dir, "loss_" + str(self.time_str)) + self.losses = [] + self.val_loss = [] + + os.makedirs(self.save_path) + + def append_loss(self, loss, val_loss): + self.losses.append(loss) + self.val_loss.append(val_loss) + with open(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".txt"), 'a') as f: + f.write(str(loss)) + f.write("\n") + with open(os.path.join(self.save_path, "epoch_val_loss_" + str(self.time_str) + ".txt"), 'a') as f: + f.write(str(val_loss)) + f.write("\n") + self.loss_plot() + + def loss_plot(self): + iters = range(len(self.losses)) + + plt.figure() + plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss') + plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss') + try: + if len(self.losses) < 25: + num = 5 + else: + num = 15 + + plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss') + plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss') + except: + pass + + plt.grid(True) + plt.xlabel('Epoch') + plt.ylabel('Loss') + plt.legend(loc="upper right") + + plt.savefig(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".png")) + + plt.cla() + plt.close("all") diff --git a/utils/dataloader.py b/utils/dataloader.py index 6ea068d20d9022dcaf9ce415d428b08bdbc691f5..71bb94f178a45d543300f53b10875da9f97f84e3 100644 --- a/utils/dataloader.py +++ b/utils/dataloader.py @@ -1,35 +1,71 @@ +from random import sample, shuffle + import cv2 import numpy as np from PIL import Image from torch.utils.data.dataset import Dataset -from utils.utils import merge_bboxes +from utils.utils import cvtColor, preprocess_input class YoloDataset(Dataset): - def __init__(self, train_lines, image_size, mosaic=True, is_train=True): + def __init__(self, annotation_lines, input_shape, num_classes, mosaic, train): super(YoloDataset, self).__init__() - - self.train_lines = train_lines - self.train_batches = 
len(train_lines) - self.image_size = image_size - self.mosaic = mosaic - self.flag = True - self.is_train = is_train + self.annotation_lines = annotation_lines + self.input_shape = input_shape + self.num_classes = num_classes + self.length = len(self.annotation_lines) + self.mosaic = mosaic + self.train = train def __len__(self): - return self.train_batches + return self.length + + def __getitem__(self, index): + index = index % self.length + #---------------------------------------------------# + # 训练时进行数据的随机增强 + # 验证时不进行数据的随机增强 + #---------------------------------------------------# + if self.mosaic: + if self.rand() < 0.5: + lines = sample(self.annotation_lines, 3) + lines.append(self.annotation_lines[index]) + shuffle(lines) + image, box = self.get_random_data_with_Mosaic(lines, self.input_shape) + else: + image, box = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train) + else: + image, box = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train) + image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1)) + box = np.array(box, dtype=np.float32) + if len(box) != 0: + box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1] + box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0] + + box[:, 2:4] = box[:, 2:4] - box[:, 0:2] + box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2 + return image, box def rand(self, a=0, b=1): - return np.random.rand() * (b - a) + a + return np.random.rand()*(b-a) + a def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True): - """实时数据增强的随机预处理""" - line = annotation_line.split() - image = Image.open(line[0]) - iw, ih = image.size - h, w = input_shape - box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]]) + line = annotation_line.split() + #------------------------------# + # 读取图像并转换成RGB图像 + #------------------------------# + image = Image.open(line[0]) + image = cvtColor(image) + #------------------------------# + # 获得图像的高宽与目标高宽 + #------------------------------# + iw, ih = image.size + h, w = input_shape + #------------------------------# + # 获得预测框 + #------------------------------# + box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]]) if not random: scale = min(w/iw, h/ih) @@ -38,56 +74,64 @@ class YoloDataset(Dataset): dx = (w-nw)//2 dy = (h-nh)//2 - image = image.resize((nw,nh), Image.BICUBIC) - new_image = Image.new('RGB', (w,h), (128,128,128)) + #---------------------------------# + # 将图像多余的部分加上灰条 + #---------------------------------# + image = image.resize((nw,nh), Image.BICUBIC) + new_image = Image.new('RGB', (w,h), (128,128,128)) new_image.paste(image, (dx, dy)) - image_data = np.array(new_image, np.float32) + image_data = np.array(new_image, np.float32) - # 调整目标框坐标 - box_data = np.zeros((len(box), 5)) - if len(box) > 0: + #---------------------------------# + # 对真实框进行调整 + #---------------------------------# + if len(box)>0: np.random.shuffle(box) - box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx - box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy - box[:, 0:2][box[:, 0:2] < 0] = 0 - box[:, 2][box[:, 2] > w] = w - box[:, 3][box[:, 3] > h] = h + box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx + box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy + box[:, 0:2][box[:, 0:2]<0] = 0 + box[:, 2][box[:, 2]>w] = w + box[:, 3][box[:, 3]>h] = h box_w = box[:, 2] - box[:, 0] box_h = box[:, 3] - box[:, 1] - box = box[np.logical_and(box_w > 1, box_h > 1)] # 保留有效框 - box_data = np.zeros((len(box), 5)) - 
box_data[:len(box)] = box - - return image_data, box_data - - # 调整图片大小 - new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter) + box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box + + return image_data, box + + #------------------------------------------# + # 对图像进行缩放并且进行长和宽的扭曲 + #------------------------------------------# + new_ar = w/h * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter) scale = self.rand(.25, 2) if new_ar < 1: - nh = int(scale * h) - nw = int(nh * new_ar) + nh = int(scale*h) + nw = int(nh*new_ar) else: - nw = int(scale * w) - nh = int(nw / new_ar) - image = image.resize((nw, nh), Image.BICUBIC) - - # 放置图片 - dx = int(self.rand(0, w - nw)) - dy = int(self.rand(0, h - nh)) - new_image = Image.new('RGB', (w, h), - (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))) + nw = int(scale*w) + nh = int(nw/new_ar) + image = image.resize((nw,nh), Image.BICUBIC) + + #------------------------------------------# + # 将图像多余的部分加上灰条 + #------------------------------------------# + dx = int(self.rand(0, w-nw)) + dy = int(self.rand(0, h-nh)) + new_image = Image.new('RGB', (w,h), (128,128,128)) new_image.paste(image, (dx, dy)) image = new_image - # 是否翻转图片 - flip = self.rand() < .5 - if flip: - image = image.transpose(Image.FLIP_LEFT_RIGHT) + #------------------------------------------# + # 翻转图像 + #------------------------------------------# + flip = self.rand()<.5 + if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT) - # 色域变换 + #------------------------------------------# + # 色域扭曲 + #------------------------------------------# hue = self.rand(-hue, hue) - sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat) - val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val) + sat = self.rand(1, sat) if self.rand()<.5 else 1/self.rand(1, sat) + val = self.rand(1, val) if self.rand()<.5 else 1/self.rand(1, val) x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV) x[..., 0] += hue*360 x[..., 0][x[..., 0]>1] -= 1 @@ -99,112 +143,134 @@ class YoloDataset(Dataset): x[x<0] = 0 image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255 - # 调整目标框坐标 - box_data = np.zeros((len(box), 5)) - if len(box) > 0: + #---------------------------------# + # 对真实框进行调整 + #---------------------------------# + if len(box)>0: np.random.shuffle(box) - box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx - box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy - if flip: - box[:, [0, 2]] = w - box[:, [2, 0]] - box[:, 0:2][box[:, 0:2] < 0] = 0 - box[:, 2][box[:, 2] > w] = w - box[:, 3][box[:, 3] > h] = h + box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx + box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy + if flip: box[:, [0,2]] = w - box[:, [2,0]] + box[:, 0:2][box[:, 0:2]<0] = 0 + box[:, 2][box[:, 2]>w] = w + box[:, 3][box[:, 3]>h] = h box_w = box[:, 2] - box[:, 0] box_h = box[:, 3] - box[:, 1] - box = box[np.logical_and(box_w > 1, box_h > 1)] # 保留有效框 - box_data = np.zeros((len(box), 5)) - box_data[:len(box)] = box - - return image_data, box_data - - def get_random_data_with_Mosaic(self, annotation_line, input_shape, hue=.1, sat=1.5, val=1.5): + box = box[np.logical_and(box_w>1, box_h>1)] + + return image_data, box + + def merge_bboxes(self, bboxes, cutx, cuty): + merge_bbox = [] + for i in range(len(bboxes)): + for box in bboxes[i]: + tmp_box = [] + x1, y1, x2, y2 = box[0], box[1], box[2], box[3] + + if i == 0: + if y1 > cuty or x1 > cutx: + continue + if y2 >= cuty and y1 <= cuty: + y2 = cuty + if x2 >= cutx and x1 <= cutx: + 
x2 = cutx + + if i == 1: + if y2 < cuty or x1 > cutx: + continue + if y2 >= cuty and y1 <= cuty: + y1 = cuty + if x2 >= cutx and x1 <= cutx: + x2 = cutx + + if i == 2: + if y2 < cuty or x2 < cutx: + continue + if y2 >= cuty and y1 <= cuty: + y1 = cuty + if x2 >= cutx and x1 <= cutx: + x1 = cutx + + if i == 3: + if y1 > cuty or x2 < cutx: + continue + if y2 >= cuty and y1 <= cuty: + y2 = cuty + if x2 >= cutx and x1 <= cutx: + x1 = cutx + tmp_box.append(x1) + tmp_box.append(y1) + tmp_box.append(x2) + tmp_box.append(y2) + tmp_box.append(box[-1]) + merge_bbox.append(tmp_box) + return merge_bbox + + def get_random_data_with_Mosaic(self, annotation_line, input_shape, max_boxes=100, hue=.1, sat=1.5, val=1.5): h, w = input_shape - min_offset_x = 0.3 - min_offset_y = 0.3 - scale_low = 1 - min(min_offset_x, min_offset_y) - scale_high = scale_low + 0.2 - - image_datas = [] - box_datas = [] - index = 0 - - place_x = [0, 0, int(w * min_offset_x), int(w * min_offset_x)] - place_y = [0, int(h * min_offset_y), int(h * min_offset_y), 0] + min_offset_x = self.rand(0.25, 0.75) + min_offset_y = self.rand(0.25, 0.75) + + nws = [ int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1))] + nhs = [ int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1))] + + place_x = [int(w*min_offset_x) - nws[0], int(w*min_offset_x) - nws[1], int(w*min_offset_x), int(w*min_offset_x)] + place_y = [int(h*min_offset_y) - nhs[0], int(h*min_offset_y), int(h*min_offset_y), int(h*min_offset_y) - nhs[3]] + + image_datas = [] + box_datas = [] + index = 0 for line in annotation_line: # 每一行进行分割 line_content = line.split() # 打开图片 image = Image.open(line_content[0]) - image = image.convert("RGB") + image = cvtColor(image) + # 图片的大小 iw, ih = image.size # 保存框的位置 - box = np.array([np.array(list(map(int, box.split(',')))) for box in line_content[1:]]) - + box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]]) + # 是否翻转图片 - flip = self.rand() < .5 - if flip and len(box) > 0: + flip = self.rand()<.5 + if flip and len(box)>0: image = image.transpose(Image.FLIP_LEFT_RIGHT) - box[:, [0, 2]] = iw - box[:, [2, 0]] - - # 对输入进来的图片进行缩放 - new_ar = w / h - scale = self.rand(scale_low, scale_high) - if new_ar < 1: - nh = int(scale * h) - nw = int(nh * new_ar) - else: - nw = int(scale * w) - nh = int(nw / new_ar) - image = image.resize((nw, nh), Image.BICUBIC) - - # 进行色域变换 - hue = self.rand(-hue, hue) - sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat) - val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val) - x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV) - x[..., 0] += hue*360 - x[..., 0][x[..., 0]>1] -= 1 - x[..., 0][x[..., 0]<0] += 1 - x[..., 1] *= sat - x[..., 2] *= val - x[x[:,:, 0]>360, 0] = 360 - x[:, :, 1:][x[:, :, 1:]>1] = 1 - x[x<0] = 0 - image = cv2.cvtColor(x, cv2.COLOR_HSV2RGB) # numpy array, 0 to 1 - - image = Image.fromarray((image * 255).astype(np.uint8)) + box[:, [0,2]] = iw - box[:, [2,0]] + + nw = nws[index] + nh = nhs[index] + image = image.resize((nw,nh), Image.BICUBIC) + # 将图片进行放置,分别对应四张分割图片的位置 dx = place_x[index] dy = place_y[index] - new_image = Image.new('RGB', (w, h), - (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))) + new_image = Image.new('RGB', (w,h), (128,128,128)) new_image.paste(image, (dx, dy)) image_data = np.array(new_image) index = index + 1 box_data = [] # 对box进行重新处理 - if len(box) > 0: + if 
len(box)>0: np.random.shuffle(box) - box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx - box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy - box[:, 0:2][box[:, 0:2] < 0] = 0 - box[:, 2][box[:, 2] > w] = w - box[:, 3][box[:, 3] > h] = h + box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx + box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy + box[:, 0:2][box[:, 0:2]<0] = 0 + box[:, 2][box[:, 2]>w] = w + box[:, 3][box[:, 3]>h] = h box_w = box[:, 2] - box[:, 0] box_h = box[:, 3] - box[:, 1] - box = box[np.logical_and(box_w > 1, box_h > 1)] - box_data = np.zeros((len(box), 5)) + box = box[np.logical_and(box_w>1, box_h>1)] + box_data = np.zeros((len(box),5)) box_data[:len(box)] = box - + image_datas.append(image_data) box_datas.append(box_data) # 将图片分割,放在一起 - cutx = np.random.randint(int(w * min_offset_x), int(w * (1 - min_offset_x))) - cuty = np.random.randint(int(h * min_offset_y), int(h * (1 - min_offset_y))) + cutx = int(w * min_offset_x) + cuty = int(h * min_offset_y) new_image = np.zeros([h, w, 3]) new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :] @@ -212,47 +278,26 @@ class YoloDataset(Dataset): new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :] new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :] + # 进行色域变换 + hue = self.rand(-hue, hue) + sat = self.rand(1, sat) if self.rand()<.5 else 1/self.rand(1, sat) + val = self.rand(1, val) if self.rand()<.5 else 1/self.rand(1, val) + x = cv2.cvtColor(np.array(new_image/255,np.float32), cv2.COLOR_RGB2HSV) + x[..., 0] += hue*360 + x[..., 0][x[..., 0]>1] -= 1 + x[..., 0][x[..., 0]<0] += 1 + x[..., 1] *= sat + x[..., 2] *= val + x[x[:, :, 0]>360, 0] = 360 + x[:, :, 1:][x[:, :, 1:]>1] = 1 + x[x<0] = 0 + new_image = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255 + # 对框进行进一步的处理 - new_boxes = np.array(merge_bboxes(box_datas, cutx, cuty)) + new_boxes = self.merge_bboxes(box_datas, cutx, cuty) return new_image, new_boxes - def __getitem__(self, index): - lines = self.train_lines - n = self.train_batches - index = index % n - if self.mosaic: - if self.flag and (index + 4) < n: - img, y = self.get_random_data_with_Mosaic(lines[index:index + 4], self.image_size[0:2]) - else: - img, y = self.get_random_data(lines[index], self.image_size[0:2], random=self.is_train) - self.flag = bool(1-self.flag) - else: - img, y = self.get_random_data(lines[index], self.image_size[0:2], random=self.is_train) - - if len(y) != 0: - # 从坐标转换成0~1的百分比 - boxes = np.array(y[:, :4], dtype=np.float32) - boxes[:, 0] = boxes[:, 0] / self.image_size[1] - boxes[:, 1] = boxes[:, 1] / self.image_size[0] - boxes[:, 2] = boxes[:, 2] / self.image_size[1] - boxes[:, 3] = boxes[:, 3] / self.image_size[0] - - boxes = np.maximum(np.minimum(boxes, 1), 0) - boxes[:, 2] = boxes[:, 2] - boxes[:, 0] - boxes[:, 3] = boxes[:, 3] - boxes[:, 1] - - boxes[:, 0] = boxes[:, 0] + boxes[:, 2] / 2 - boxes[:, 1] = boxes[:, 1] + boxes[:, 3] / 2 - y = np.concatenate([boxes, y[:, -1:]], axis=-1) - - img = np.array(img, dtype=np.float32) - - tmp_inp = np.transpose(img / 255.0, (2, 0, 1)) - tmp_targets = np.array(y, dtype=np.float32) - return tmp_inp, tmp_targets - - # DataLoader中collate_fn使用 def yolo_dataset_collate(batch): images = [] @@ -261,5 +306,4 @@ def yolo_dataset_collate(batch): images.append(img) bboxes.append(box) images = np.array(images) - return images, bboxes - + return images, bboxes \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py index f842779534926ef58544c63a5a554b7157362e71..4c122c21174e44e9287393d2fda95500ba271783 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -1,374 +1,62 @@ 
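Before the utils.py hunk: the `__getitem__` conversion above turns absolute x1,y1,x2,y2 corners into the normalized cx,cy,w,h format the loss expects. A quick numeric check of exactly those four lines, on an assumed 416x416 input:

    import numpy as np

    input_shape = [416, 416]                                          # h, w
    box = np.array([[104., 52., 312., 260., 0.]], dtype=np.float32)   # x1 y1 x2 y2 cls
    box[:, [0, 2]] = box[:, [0, 2]] / input_shape[1]  # normalize x by width
    box[:, [1, 3]] = box[:, [1, 3]] / input_shape[0]  # normalize y by height
    box[:, 2:4]   = box[:, 2:4] - box[:, 0:2]         # corners -> width, height
    box[:, 0:2]   = box[:, 0:2] + box[:, 2:4] / 2     # top-left -> center
    print(box)   # [[0.5 0.375 0.5 0.5 0.]] -> cx, cy, w, h, class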
-from __future__ import division - import numpy as np -import torch -import torch.nn as nn from PIL import Image -from torchvision.ops import nms - - -class DecodeBox(nn.Module): - def __init__(self, anchors, num_classes, img_size): - super(DecodeBox, self).__init__() - #-----------------------------------------------------------# - # 13x13的特征层对应的anchor是[142, 110], [192, 243], [459, 401] - # 26x26的特征层对应的anchor是[36, 75], [76, 55], [72, 146] - # 52x52的特征层对应的anchor是[12, 16], [19, 36], [40, 28] - #-----------------------------------------------------------# - self.anchors = anchors - self.num_anchors = len(anchors) - self.num_classes = num_classes - self.bbox_attrs = 5 + num_classes - self.img_size = img_size - - def forward(self, input): - #-----------------------------------------------# - # 输入的input一共有三个,他们的shape分别是 - # batch_size, 255, 13, 13 - # batch_size, 255, 26, 26 - # batch_size, 255, 52, 52 - #-----------------------------------------------# - batch_size = input.size(0) - input_height = input.size(2) - input_width = input.size(3) - - #-----------------------------------------------# - # 输入为416x416时 - # stride_h = stride_w = 32、16、8 - #-----------------------------------------------# - stride_h = self.img_size[1] / input_height - stride_w = self.img_size[0] / input_width - #-------------------------------------------------# - # 此时获得的scaled_anchors大小是相对于特征层的 - #-------------------------------------------------# - scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors] - - #-----------------------------------------------# - # 输入的input一共有三个,他们的shape分别是 - # batch_size, 3, 13, 13, 85 - # batch_size, 3, 26, 26, 85 - # batch_size, 3, 52, 52, 85 - #-----------------------------------------------# - prediction = input.view(batch_size, self.num_anchors, - self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous() - - # 先验框的中心位置的调整参数 - x = torch.sigmoid(prediction[..., 0]) - y = torch.sigmoid(prediction[..., 1]) - # 先验框的宽高调整参数 - w = prediction[..., 2] - h = prediction[..., 3] - # 获得置信度,是否有物体 - conf = torch.sigmoid(prediction[..., 4]) - # 种类置信度 - pred_cls = torch.sigmoid(prediction[..., 5:]) - - FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor - LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor - - #----------------------------------------------------------# - # 生成网格,先验框中心,网格左上角 - # batch_size,3,13,13 - #----------------------------------------------------------# - grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat( - batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor) - grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat( - batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor) - - #----------------------------------------------------------# - # 按照网格格式生成先验框的宽高 - # batch_size,3,13,13 - #----------------------------------------------------------# - anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) - anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) - anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape) - anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape) - - #----------------------------------------------------------# - # 利用预测结果对先验框进行调整 - # 首先调整先验框的中心,从先验框中心向右下角偏移 - # 再调整先验框的宽高。 - 
#----------------------------------------------------------# - pred_boxes = FloatTensor(prediction[..., :4].shape) - pred_boxes[..., 0] = x.data + grid_x - pred_boxes[..., 1] = y.data + grid_y - pred_boxes[..., 2] = torch.exp(w.data) * anchor_w - pred_boxes[..., 3] = torch.exp(h.data) * anchor_h - - # fig = plt.figure() - # ax = fig.add_subplot(121) - # if input_height==13: - # plt.ylim(0,13) - # plt.xlim(0,13) - # elif input_height==26: - # plt.ylim(0,26) - # plt.xlim(0,26) - # elif input_height==52: - # plt.ylim(0,52) - # plt.xlim(0,52) - # plt.scatter(grid_x.cpu(),grid_y.cpu()) - - # anchor_left = grid_x - anchor_w/2 - # anchor_top = grid_y - anchor_h/2 - - # rect1 = plt.Rectangle([anchor_left[0,0,5,5],anchor_top[0,0,5,5]],anchor_w[0,0,5,5],anchor_h[0,0,5,5],color="r",fill=False) - # rect2 = plt.Rectangle([anchor_left[0,1,5,5],anchor_top[0,1,5,5]],anchor_w[0,1,5,5],anchor_h[0,1,5,5],color="r",fill=False) - # rect3 = plt.Rectangle([anchor_left[0,2,5,5],anchor_top[0,2,5,5]],anchor_w[0,2,5,5],anchor_h[0,2,5,5],color="r",fill=False) - - # ax.add_patch(rect1) - # ax.add_patch(rect2) - # ax.add_patch(rect3) - - # ax = fig.add_subplot(122) - # if input_height==13: - # plt.ylim(0,13) - # plt.xlim(0,13) - # elif input_height==26: - # plt.ylim(0,26) - # plt.xlim(0,26) - # elif input_height==52: - # plt.ylim(0,52) - # plt.xlim(0,52) - # plt.scatter(grid_x.cpu(),grid_y.cpu()) - # plt.scatter(pred_boxes[0,:,5,5,0].cpu(),pred_boxes[0,:,5,5,1].cpu(),c='r') - - # pre_left = pred_boxes[...,0] - pred_boxes[...,2]/2 - # pre_top = pred_boxes[...,1] - pred_boxes[...,3]/2 - - # rect1 = plt.Rectangle([pre_left[0,0,5,5],pre_top[0,0,5,5]],pred_boxes[0,0,5,5,2],pred_boxes[0,0,5,5,3],color="r",fill=False) - # rect2 = plt.Rectangle([pre_left[0,1,5,5],pre_top[0,1,5,5]],pred_boxes[0,1,5,5,2],pred_boxes[0,1,5,5,3],color="r",fill=False) - # rect3 = plt.Rectangle([pre_left[0,2,5,5],pre_top[0,2,5,5]],pred_boxes[0,2,5,5,2],pred_boxes[0,2,5,5,3],color="r",fill=False) - - # ax.add_patch(rect1) - # ax.add_patch(rect2) - # ax.add_patch(rect3) - - # plt.show() - - #----------------------------------------------------------# - # 将输出结果调整成相对于输入图像大小 - #----------------------------------------------------------# - _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor) - output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale, - conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1) - return output.data - -def letterbox_image(image, size): - iw, ih = image.size - w, h = size - scale = min(w/iw, h/ih) - nw = int(iw*scale) - nh = int(ih*scale) - - image = image.resize((nw,nh), Image.BICUBIC) - new_image = Image.new('RGB', size, (128,128,128)) - new_image.paste(image, ((w-nw)//2, (h-nh)//2)) - return new_image - -def yolo_correct_boxes(top, left, bottom, right, input_shape, image_shape): - new_shape = image_shape*np.min(input_shape/image_shape) - - offset = (input_shape-new_shape)/2./input_shape - scale = input_shape/new_shape - - box_yx = np.concatenate(((top+bottom)/2,(left+right)/2),axis=-1)/input_shape - box_hw = np.concatenate((bottom-top,right-left),axis=-1)/input_shape - - box_yx = (box_yx - offset) * scale - box_hw *= scale - box_mins = box_yx - (box_hw / 2.) - box_maxes = box_yx + (box_hw / 2.) 
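#---------------------------------------------------------------------------#
#   A worked example of the (centre, size) -> (min, max) step just above; the
#   box values are assumptions chosen for illustration:
#---------------------------------------------------------------------------#
import numpy as np
box_yx = np.array([[0.5, 0.5]])     # normalised box centre, (y, x) order
box_hw = np.array([[0.4, 0.2]])     # normalised box size, (h, w) order
box_mins = box_yx - (box_hw / 2.)   # -> [[0.3, 0.4]]  (top, left)
box_maxes = box_yx + (box_hw / 2.)  # -> [[0.7, 0.6]]  (bottom, right)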
- boxes = np.concatenate([ - box_mins[:, 0:1], - box_mins[:, 1:2], - box_maxes[:, 0:1], - box_maxes[:, 1:2] - ],axis=-1) - boxes *= np.concatenate([image_shape, image_shape],axis=-1) - return boxes - -def bbox_iou(box1, box2, x1y1x2y2=True): - """ - 计算IOU - """ - if not x1y1x2y2: - b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 - b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 - b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 - b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 +#---------------------------------------------------------# +# 将图像转换成RGB图像,防止灰度图在预测时报错。 +# 代码仅仅支持RGB图像的预测,所有其它类型的图像都会转化成RGB +#---------------------------------------------------------# +def cvtColor(image): + if len(np.shape(image)) == 3 and np.shape(image)[-1] == 3: + return image else: - b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] - b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] - - inter_rect_x1 = torch.max(b1_x1, b2_x1) - inter_rect_y1 = torch.max(b1_y1, b2_y1) - inter_rect_x2 = torch.min(b1_x2, b2_x2) - inter_rect_y2 = torch.min(b1_y2, b2_y2) - - inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * \ - torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0) - - b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) - b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) - - iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) - - return iou - - -def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4): - #----------------------------------------------------------# - # 将预测结果的格式转换成左上角右下角的格式。 - # prediction [batch_size, num_anchors, 85] - #----------------------------------------------------------# - box_corner = prediction.new(prediction.shape) - box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 - box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 - box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 - box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 - prediction[:, :, :4] = box_corner[:, :, :4] - - output = [None for _ in range(len(prediction))] - for image_i, image_pred in enumerate(prediction): - #----------------------------------------------------------# - # 对种类预测部分取max。 - # class_conf [num_anchors, 1] 种类置信度 - # class_pred [num_anchors, 1] 种类 - #----------------------------------------------------------# - class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True) - - #----------------------------------------------------------# - # 利用置信度进行第一轮筛选 - #----------------------------------------------------------# - conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze() - - #----------------------------------------------------------# - # 根据置信度进行预测结果的筛选 - #----------------------------------------------------------# - image_pred = image_pred[conf_mask] - class_conf = class_conf[conf_mask] - class_pred = class_pred[conf_mask] - if not image_pred.size(0): - continue - #-------------------------------------------------------------------------# - # detections [num_anchors, 7] - # 7的内容为:x1, y1, x2, y2, obj_conf, class_conf, class_pred - #-------------------------------------------------------------------------# - detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1) - - #------------------------------------------# - # 获得预测结果中包含的所有种类 - #------------------------------------------# - 
unique_labels = detections[:, -1].cpu().unique() - - if prediction.is_cuda: - unique_labels = unique_labels.cuda() - detections = detections.cuda() - - for c in unique_labels: - #------------------------------------------# - # 获得某一类得分筛选后全部的预测结果 - #------------------------------------------# - detections_class = detections[detections[:, -1] == c] - - #------------------------------------------# - # 使用官方自带的非极大抑制会速度更快一些! - #------------------------------------------# - keep = nms( - detections_class[:, :4], - detections_class[:, 4] * detections_class[:, 5], - nms_thres - ) - max_detections = detections_class[keep] - - # # 按照存在物体的置信度排序 - # _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True) - # detections_class = detections_class[conf_sort_index] - # # 进行非极大抑制 - # max_detections = [] - # while detections_class.size(0): - # # 取出这一类置信度最高的,一步一步往下判断,判断重合程度是否大于nms_thres,如果是则去除掉 - # max_detections.append(detections_class[0].unsqueeze(0)) - # if len(detections_class) == 1: - # break - # ious = bbox_iou(max_detections[-1], detections_class[1:]) - # detections_class = detections_class[1:][ious < nms_thres] - # # 堆叠 - # max_detections = torch.cat(max_detections).data - - # Add max detections to outputs - output[image_i] = max_detections if output[image_i] is None else torch.cat( - (output[image_i], max_detections)) - - return output - - -def merge_bboxes(bboxes, cutx, cuty): - merge_bbox = [] - for i in range(len(bboxes)): - for box in bboxes[i]: - tmp_box = [] - x1,y1,x2,y2 = box[0], box[1], box[2], box[3] - - if i == 0: - if y1 > cuty or x1 > cutx: - continue - if y2 >= cuty and y1 <= cuty: - y2 = cuty - if y2-y1 < 5: - continue - if x2 >= cutx and x1 <= cutx: - x2 = cutx - if x2-x1 < 5: - continue - - if i == 1: - if y2 < cuty or x1 > cutx: - continue - - if y2 >= cuty and y1 <= cuty: - y1 = cuty - if y2-y1 < 5: - continue - - if x2 >= cutx and x1 <= cutx: - x2 = cutx - if x2-x1 < 5: - continue - - if i == 2: - if y2 < cuty or x2 < cutx: - continue - - if y2 >= cuty and y1 <= cuty: - y1 = cuty - if y2-y1 < 5: - continue - - if x2 >= cutx and x1 <= cutx: - x1 = cutx - if x2-x1 < 5: - continue - - if i == 3: - if y1 > cuty or x2 < cutx: - continue - - if y2 >= cuty and y1 <= cuty: - y2 = cuty - if y2-y1 < 5: - continue - - if x2 >= cutx and x1 <= cutx: - x1 = cutx - if x2-x1 < 5: - continue + image = image.convert('RGB') + return image + +#---------------------------------------------------# +# 对输入图像进行resize +#---------------------------------------------------# +def resize_image(image, size, letterbox_image): + iw, ih = image.size + w, h = size + if letterbox_image: + scale = min(w/iw, h/ih) + nw = int(iw*scale) + nh = int(ih*scale) + + image = image.resize((nw,nh), Image.BICUBIC) + new_image = Image.new('RGB', size, (128,128,128)) + new_image.paste(image, ((w-nw)//2, (h-nh)//2)) + else: + new_image = image.resize((w, h), Image.BICUBIC) + return new_image - tmp_box.append(x1) - tmp_box.append(y1) - tmp_box.append(x2) - tmp_box.append(y2) - tmp_box.append(box[-1]) - merge_bbox.append(tmp_box) - return merge_bbox +#---------------------------------------------------# +# 获得类 +#---------------------------------------------------# +def get_classes(classes_path): + with open(classes_path, encoding='utf-8') as f: + class_names = f.readlines() + class_names = [c.strip() for c in class_names] + return class_names, len(class_names) + +#---------------------------------------------------# +# 获得先验框 +#---------------------------------------------------# +def 
get_anchors(anchors_path): + '''loads the anchors from a file''' + with open(anchors_path, encoding='utf-8') as f: + anchors = f.readline() + anchors = [float(x) for x in anchors.split(',')] + anchors = np.array(anchors).reshape(-1, 2) + return anchors, len(anchors) + +#---------------------------------------------------# +# 获得学习率 +#---------------------------------------------------# +def get_lr(optimizer): + for param_group in optimizer.param_groups: + return param_group['lr'] + +def preprocess_input(image): + image /= 255.0 + return image \ No newline at end of file diff --git a/utils/utils_bbox.py b/utils/utils_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..f5549c848dc324df1a7bfbeeb68526b9585ea586 --- /dev/null +++ b/utils/utils_bbox.py @@ -0,0 +1,227 @@ +import torch +import torch.nn as nn +from torchvision.ops import nms +import numpy as np + +class DecodeBox(): + def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[6,7,8], [3,4,5], [0,1,2]]): + super(DecodeBox, self).__init__() + self.anchors = anchors + self.num_classes = num_classes + self.bbox_attrs = 5 + num_classes + self.input_shape = input_shape + #-----------------------------------------------------------# + # 13x13的特征层对应的anchor是[142, 110],[192, 243],[459, 401] + # 26x26的特征层对应的anchor是[36, 75],[76, 55],[72, 146] + # 52x52的特征层对应的anchor是[12, 16],[19, 36],[40, 28] + #-----------------------------------------------------------# + self.anchors_mask = anchors_mask + + def decode_box(self, inputs): + outputs = [] + for i, input in enumerate(inputs): + #-----------------------------------------------# + # 输入的input一共有三个,他们的shape分别是 + # batch_size, 255, 13, 13 + # batch_size, 255, 26, 26 + # batch_size, 255, 52, 52 + #-----------------------------------------------# + batch_size = input.size(0) + input_height = input.size(2) + input_width = input.size(3) + + #-----------------------------------------------# + # 输入为416x416时 + # stride_h = stride_w = 32、16、8 + #-----------------------------------------------# + stride_h = self.input_shape[0] / input_height + stride_w = self.input_shape[1] / input_width + #-------------------------------------------------# + # 此时获得的scaled_anchors大小是相对于特征层的 + #-------------------------------------------------# + scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors[self.anchors_mask[i]]] + + #-----------------------------------------------# + # 输入的input一共有三个,他们的shape分别是 + # batch_size, 3, 13, 13, 85 + # batch_size, 3, 26, 26, 85 + # batch_size, 3, 52, 52, 85 + #-----------------------------------------------# + prediction = input.view(batch_size, len(self.anchors_mask[i]), + self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous() + + #-----------------------------------------------# + # 先验框的中心位置的调整参数 + #-----------------------------------------------# + x = torch.sigmoid(prediction[..., 0]) + y = torch.sigmoid(prediction[..., 1]) + #-----------------------------------------------# + # 先验框的宽高调整参数 + #-----------------------------------------------# + w = prediction[..., 2] + h = prediction[..., 3] + #-----------------------------------------------# + # 获得置信度,是否有物体 + #-----------------------------------------------# + conf = torch.sigmoid(prediction[..., 4]) + #-----------------------------------------------# + # 种类置信度 + #-----------------------------------------------# + pred_cls = torch.sigmoid(prediction[..., 5:]) + + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else 
torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + + #----------------------------------------------------------# + # 生成网格,先验框中心,网格左上角 + # batch_size,3,13,13 + #----------------------------------------------------------# + grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat( + batch_size * len(self.anchors_mask[i]), 1, 1).view(x.shape).type(FloatTensor) + grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat( + batch_size * len(self.anchors_mask[i]), 1, 1).view(y.shape).type(FloatTensor) + + #----------------------------------------------------------# + # 按照网格格式生成先验框的宽高 + # batch_size,3,13,13 + #----------------------------------------------------------# + anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) + anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) + anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape) + anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape) + + #----------------------------------------------------------# + # 利用预测结果对先验框进行调整 + # 首先调整先验框的中心,从先验框中心向右下角偏移 + # 再调整先验框的宽高。 + #----------------------------------------------------------# + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + grid_x + pred_boxes[..., 1] = y.data + grid_y + pred_boxes[..., 2] = torch.exp(w.data) * anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * anchor_h + + #----------------------------------------------------------# + # 将输出结果归一化成小数的形式 + #----------------------------------------------------------# + _scale = torch.Tensor([input_width, input_height, input_width, input_height]).type(FloatTensor) + output = torch.cat((pred_boxes.view(batch_size, -1, 4) / _scale, + conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1) + outputs.append(output.data) + return outputs + + def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image): + #-----------------------------------------------------------------# + # 把y轴放前面是因为方便预测框和图像的宽高进行相乘 + #-----------------------------------------------------------------# + box_yx = box_xy[..., ::-1] + box_hw = box_wh[..., ::-1] + input_shape = np.array(input_shape) + image_shape = np.array(image_shape) + + if letterbox_image: + #-----------------------------------------------------------------# + # 这里求出来的offset是图像有效区域相对于图像左上角的偏移情况 + # new_shape指的是宽高缩放情况 + #-----------------------------------------------------------------# + new_shape = np.round(image_shape * np.min(input_shape/image_shape)) + offset = (input_shape - new_shape)/2./input_shape + scale = input_shape/new_shape + + box_yx = (box_yx - offset) * scale + box_hw *= scale + + box_mins = box_yx - (box_hw / 2.) + box_maxes = box_yx + (box_hw / 2.) 
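#---------------------------------------------------------------------------#
#   A worked example of the letterbox un-mapping above; the shapes are
#   assumptions (416x416 network input, 480x640 original image):
#---------------------------------------------------------------------------#
import numpy as np
input_shape = np.array([416, 416])
image_shape = np.array([480, 640])
new_shape = np.round(image_shape * np.min(input_shape / image_shape))   # [312. 416.]
offset = (input_shape - new_shape) / 2. / input_shape                   # [0.125 0.   ]
scale = input_shape / new_shape                                         # [1.333 1.   ]
# A box centre at y = 0.5 of the letterboxed input maps back to
# (0.5 - 0.125) * 1.333 = 0.5 of the original image, as expected.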
+ boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1) + boxes *= np.concatenate([image_shape, image_shape], axis=-1) + return boxes + + def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4): + #----------------------------------------------------------# + # 将预测结果的格式转换成左上角右下角的格式。 + # prediction [batch_size, num_anchors, 85] + #----------------------------------------------------------# + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + #----------------------------------------------------------# + # 对种类预测部分取max。 + # class_conf [num_anchors, 1] 种类置信度 + # class_pred [num_anchors, 1] 种类 + #----------------------------------------------------------# + class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True) + + #----------------------------------------------------------# + # 利用置信度进行第一轮筛选 + #----------------------------------------------------------# + conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze() + + #----------------------------------------------------------# + # 根据置信度进行预测结果的筛选 + #----------------------------------------------------------# + image_pred = image_pred[conf_mask] + class_conf = class_conf[conf_mask] + class_pred = class_pred[conf_mask] + if not image_pred.size(0): + continue + #-------------------------------------------------------------------------# + # detections [num_anchors, 7] + # 7的内容为:x1, y1, x2, y2, obj_conf, class_conf, class_pred + #-------------------------------------------------------------------------# + detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1) + + #------------------------------------------# + # 获得预测结果中包含的所有种类 + #------------------------------------------# + unique_labels = detections[:, -1].cpu().unique() + + if prediction.is_cuda: + unique_labels = unique_labels.cuda() + detections = detections.cuda() + + for c in unique_labels: + #------------------------------------------# + # 获得某一类得分筛选后全部的预测结果 + #------------------------------------------# + detections_class = detections[detections[:, -1] == c] + + #------------------------------------------# + # 使用官方自带的非极大抑制会速度更快一些! 
+ #------------------------------------------# + keep = nms( + detections_class[:, :4], + detections_class[:, 4] * detections_class[:, 5], + nms_thres + ) + max_detections = detections_class[keep] + + # # 按照存在物体的置信度排序 + # _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True) + # detections_class = detections_class[conf_sort_index] + # # 进行非极大抑制 + # max_detections = [] + # while detections_class.size(0): + # # 取出这一类置信度最高的,一步一步往下判断,判断重合程度是否大于nms_thres,如果是则去除掉 + # max_detections.append(detections_class[0].unsqueeze(0)) + # if len(detections_class) == 1: + # break + # ious = bbox_iou(max_detections[-1], detections_class[1:]) + # detections_class = detections_class[1:][ious < nms_thres] + # # 堆叠 + # max_detections = torch.cat(max_detections).data + + # Add max detections to outputs + output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections)) + + if output[i] is not None: + output[i] = output[i].cpu().numpy() + box_xy, box_wh = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2] + output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image) + return output diff --git a/utils/utils_fit.py b/utils/utils_fit.py new file mode 100644 index 0000000000000000000000000000000000000000..53dcc334d53824522f73b5894bc9cfa18e318c30 --- /dev/null +++ b/utils/utils_fit.py @@ -0,0 +1,102 @@ +import torch +from tqdm import tqdm + +from utils.utils import get_lr + +def fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda): + loss = 0 + val_loss = 0 + + model_train.train() + print('Start Train') + with tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar: + for iteration, batch in enumerate(gen): + if iteration >= epoch_step: + break + + images, targets = batch[0], batch[1] + with torch.no_grad(): + if cuda: + images = torch.from_numpy(images).type(torch.FloatTensor).cuda() + targets = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in targets] + else: + images = torch.from_numpy(images).type(torch.FloatTensor) + targets = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets] + #----------------------# + # 清零梯度 + #----------------------# + optimizer.zero_grad() + #----------------------# + # 前向传播 + #----------------------# + outputs = model_train(images) + + loss_value_all = 0 + num_pos_all = 0 + #----------------------# + # 计算损失 + #----------------------# + for l in range(len(outputs)): + loss_item, num_pos = yolo_loss(l, outputs[l], targets) + loss_value_all += loss_item + num_pos_all += num_pos + loss_value = loss_value_all / num_pos_all + + #----------------------# + # 反向传播 + #----------------------# + loss_value.backward() + optimizer.step() + + loss += loss_value.item() + + pbar.set_postfix(**{'loss' : loss / (iteration + 1), + 'lr' : get_lr(optimizer)}) + pbar.update(1) + + print('Finish Train') + + model_train.eval() + print('Start Validation') + with tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar: + for iteration, batch in enumerate(gen_val): + if iteration >= epoch_step_val: + break + images, targets = batch[0], batch[1] + with torch.no_grad(): + if cuda: + images = torch.from_numpy(images).type(torch.FloatTensor).cuda() + targets = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in targets] + else: + images = 
torch.from_numpy(images).type(torch.FloatTensor) + targets = [torch.from_numpy(ann).type(torch.FloatTensor) for ann in targets] + #----------------------# + # 清零梯度 + #----------------------# + optimizer.zero_grad() + #----------------------# + # 前向传播 + #----------------------# + outputs = model_train(images) + + loss_value_all = 0 + num_pos_all = 0 + #----------------------# + # 计算损失 + #----------------------# + for l in range(len(outputs)): + loss_item, num_pos = yolo_loss(l, outputs[l], targets) + loss_value_all += loss_item + num_pos_all += num_pos + loss_value = loss_value_all / num_pos_all + + val_loss += loss_value.item() + pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)}) + pbar.update(1) + + print('Finish Validation') + + loss_history.append_loss(loss / epoch_step, val_loss / epoch_step_val) + print('Epoch:'+ str(epoch+1) + '/' + str(Epoch)) + print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val)) + torch.save(model.state_dict(), 'logs/ep%03d-loss%.3f-val_loss%.3f.pth' % (epoch + 1, loss / epoch_step, val_loss / epoch_step_val)) diff --git a/utils/utils_map.py b/utils/utils_map.py new file mode 100644 index 0000000000000000000000000000000000000000..45aba747fdcd742b7098b31029dd8df9384be699 --- /dev/null +++ b/utils/utils_map.py @@ -0,0 +1,897 @@ +import glob +import json +import math +import operator +import os +import shutil +import sys + +import cv2 +import matplotlib.pyplot as plt +import numpy as np + +''' + 0,0 ------> x (width) + | + | (Left,Top) + | *_________ + | | | + | | + y |_________| + (height) * + (Right,Bottom) +''' + +def log_average_miss_rate(precision, fp_cumsum, num_images): + """ + log-average miss rate: + Calculated by averaging miss rates at 9 evenly spaced FPPI points + between 10e-2 and 10e0, in log-space. + + output: + lamr | log-average miss rate + mr | miss rate + fppi | false positives per image + + references: + [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the + State of the Art." Pattern Analysis and Machine Intelligence, IEEE + Transactions on 34.4 (2012): 743 - 761. + """ + + if precision.size == 0: + lamr = 0 + mr = 1 + fppi = 0 + return lamr, mr, fppi + + fppi = fp_cumsum / float(num_images) + mr = (1 - precision) + + fppi_tmp = np.insert(fppi, 0, -1.0) + mr_tmp = np.insert(mr, 0, 1.0) + + ref = np.logspace(-2.0, 0.0, num = 9) + for i, ref_i in enumerate(ref): + j = np.where(fppi_tmp <= ref_i)[-1][-1] + ref[i] = mr_tmp[j] + + lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref)))) + + return lamr, mr, fppi + +""" + throw error and exit +""" +def error(msg): + print(msg) + sys.exit(0) + +""" + check if the number is a float between 0.0 and 1.0 +""" +def is_float_between_0_and_1(value): + try: + val = float(value) + if val > 0.0 and val < 1.0: + return True + else: + return False + except ValueError: + return False + +""" + Calculate the AP given the recall and precision array + 1st) We compute a version of the measured precision/recall curve with + precision monotonically decreasing + 2nd) We compute the AP as the area under this curve by numerical integration. 
+""" +def voc_ap(rec, prec): + """ + --- Official matlab code VOC2012--- + mrec=[0 ; rec ; 1]; + mpre=[0 ; prec ; 0]; + for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + end + i=find(mrec(2:end)~=mrec(1:end-1))+1; + ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + rec.insert(0, 0.0) # insert 0.0 at begining of list + rec.append(1.0) # insert 1.0 at end of list + mrec = rec[:] + prec.insert(0, 0.0) # insert 0.0 at begining of list + prec.append(0.0) # insert 0.0 at end of list + mpre = prec[:] + """ + This part makes the precision monotonically decreasing + (goes from the end to the beginning) + matlab: for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + """ + for i in range(len(mpre)-2, -1, -1): + mpre[i] = max(mpre[i], mpre[i+1]) + """ + This part creates a list of indexes where the recall changes + matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1; + """ + i_list = [] + for i in range(1, len(mrec)): + if mrec[i] != mrec[i-1]: + i_list.append(i) # if it was matlab would be i + 1 + """ + The Average Precision (AP) is the area under the curve + (numerical integration) + matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + ap = 0.0 + for i in i_list: + ap += ((mrec[i]-mrec[i-1])*mpre[i]) + return ap, mrec, mpre + + +""" + Convert the lines of a file to a list +""" +def file_lines_to_list(path): + # open txt file lines to a list + with open(path) as f: + content = f.readlines() + # remove whitespace characters like `\n` at the end of each line + content = [x.strip() for x in content] + return content + +""" + Draws text in image +""" +def draw_text_in_image(img, text, pos, color, line_width): + font = cv2.FONT_HERSHEY_PLAIN + fontScale = 1 + lineType = 1 + bottomLeftCornerOfText = pos + cv2.putText(img, text, + bottomLeftCornerOfText, + font, + fontScale, + color, + lineType) + text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0] + return img, (line_width + text_width) + +""" + Plot - adjust axes +""" +def adjust_axes(r, t, fig, axes): + # get text width for re-scaling + bb = t.get_window_extent(renderer=r) + text_width_inches = bb.width / fig.dpi + # get axis width in inches + current_fig_width = fig.get_figwidth() + new_fig_width = current_fig_width + text_width_inches + propotion = new_fig_width / current_fig_width + # get axis limit + x_lim = axes.get_xlim() + axes.set_xlim([x_lim[0], x_lim[1]*propotion]) + +""" + Draw plot using Matplotlib +""" +def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar): + # sort the dictionary by decreasing value, into a list of tuples + sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1)) + # unpacking the list of tuples into two lists + sorted_keys, sorted_values = zip(*sorted_dic_by_value) + # + if true_p_bar != "": + """ + Special case to draw in: + - green -> TP: True Positives (object detected and matches ground-truth) + - red -> FP: False Positives (object detected but does not match ground-truth) + - orange -> FN: False Negatives (object not detected but present in the ground-truth) + """ + fp_sorted = [] + tp_sorted = [] + for key in sorted_keys: + fp_sorted.append(dictionary[key] - true_p_bar[key]) + tp_sorted.append(true_p_bar[key]) + plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive') + plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted) + # add legend + plt.legend(loc='lower right') + """ + Write number on side of bar + """ 
+ fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + fp_val = fp_sorted[i] + tp_val = tp_sorted[i] + fp_str_val = " " + str(fp_val) + tp_str_val = fp_str_val + " " + str(tp_val) + # trick to paint multicolor with offset: + # first paint everything and then repaint the first number + t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold') + plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold') + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + else: + plt.barh(range(n_classes), sorted_values, color=plot_color) + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + str_val = " " + str(val) # add a space before + if val < 1.0: + str_val = " {0:.2f}".format(val) + t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold') + # re-set axes to show number inside the figure + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + # set window title + fig.canvas.set_window_title(window_title) + # write classes in y axis + tick_font_size = 12 + plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size) + """ + Re-scale height accordingly + """ + init_height = fig.get_figheight() + # comput the matrix height in points and inches + dpi = fig.dpi + height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing) + height_in = height_pt / dpi + # compute the required figure height + top_margin = 0.15 # in percentage of the figure height + bottom_margin = 0.05 # in percentage of the figure height + figure_height = height_in / (1 - top_margin - bottom_margin) + # set new height + if figure_height > init_height: + fig.set_figheight(figure_height) + + # set plot title + plt.title(plot_title, fontsize=14) + # set axis titles + # plt.xlabel('classes') + plt.xlabel(x_label, fontsize='large') + # adjust size of window + fig.tight_layout() + # save the plot + fig.savefig(output_path) + # show image + if to_show: + plt.show() + # close the plot + plt.close() + +def get_map(MINOVERLAP, draw_plot, path = './map_out'): + GT_PATH = os.path.join(path, 'ground-truth') + DR_PATH = os.path.join(path, 'detection-results') + IMG_PATH = os.path.join(path, 'images-optional') + TEMP_FILES_PATH = os.path.join(path, '.temp_files') + RESULTS_FILES_PATH = os.path.join(path, 'results') + + show_animation = True + if os.path.exists(IMG_PATH): + for dirpath, dirnames, files in os.walk(IMG_PATH): + if not files: + show_animation = False + else: + show_animation = False + + if not os.path.exists(TEMP_FILES_PATH): + os.makedirs(TEMP_FILES_PATH) + + if os.path.exists(RESULTS_FILES_PATH): + shutil.rmtree(RESULTS_FILES_PATH) + if draw_plot: + os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision")) + if show_animation: + os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one")) + + ground_truth_files_list = glob.glob(GT_PATH + '/*.txt') + if len(ground_truth_files_list) == 0: + error("Error: No ground-truth files found!") + ground_truth_files_list.sort() + gt_counter_per_class = {} + counter_images_per_class = {} + + for txt_file in ground_truth_files_list: + file_id = txt_file.split(".txt", 1)[0] + 
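#---------------------------------------------------------------------------#
#   The split/basename pair around this point reduces a ground-truth path to
#   its bare image id; a tiny example with an assumed path:
#---------------------------------------------------------------------------#
import os
example_id = 'map_out/ground-truth/000017.txt'.split(".txt", 1)[0]
example_id = os.path.basename(os.path.normpath(example_id))   # -> '000017'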
file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(DR_PATH, (file_id + ".txt")) + if not os.path.exists(temp_path): + error_msg = "Error. File not found: {}\n".format(temp_path) + error(error_msg) + lines_list = file_lines_to_list(txt_file) + bounding_boxes = [] + is_difficult = False + already_seen_classes = [] + for line in lines_list: + try: + if "difficult" in line: + class_name, left, top, right, bottom, _difficult = line.split() + is_difficult = True + else: + class_name, left, top, right, bottom = line.split() + except: + if "difficult" in line: + line_split = line.split() + _difficult = line_split[-1] + bottom = line_split[-2] + right = line_split[-3] + top = line_split[-4] + left = line_split[-5] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + is_difficult = True + else: + line_split = line.split() + bottom = line_split[-1] + right = line_split[-2] + top = line_split[-3] + left = line_split[-4] + class_name = "" + for name in line_split[:-4]: + class_name += name + " " + class_name = class_name[:-1] + + bbox = left + " " + top + " " + right + " " + bottom + if is_difficult: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True}) + is_difficult = False + else: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False}) + if class_name in gt_counter_per_class: + gt_counter_per_class[class_name] += 1 + else: + gt_counter_per_class[class_name] = 1 + + if class_name not in already_seen_classes: + if class_name in counter_images_per_class: + counter_images_per_class[class_name] += 1 + else: + counter_images_per_class[class_name] = 1 + already_seen_classes.append(class_name) + + with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + + gt_classes = list(gt_counter_per_class.keys()) + gt_classes = sorted(gt_classes) + n_classes = len(gt_classes) + + dr_files_list = glob.glob(DR_PATH + '/*.txt') + dr_files_list.sort() + for class_index, class_name in enumerate(gt_classes): + bounding_boxes = [] + for txt_file in dr_files_list: + file_id = txt_file.split(".txt",1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(GT_PATH, (file_id + ".txt")) + if class_index == 0: + if not os.path.exists(temp_path): + error_msg = "Error. 
File not found: {}\n".format(temp_path) + error(error_msg) + lines = file_lines_to_list(txt_file) + for line in lines: + try: + tmp_class_name, confidence, left, top, right, bottom = line.split() + except: + line_split = line.split() + bottom = line_split[-1] + right = line_split[-2] + top = line_split[-3] + left = line_split[-4] + confidence = line_split[-5] + tmp_class_name = "" + for name in line_split[:-5]: + tmp_class_name += name + " " + tmp_class_name = tmp_class_name[:-1] + + if tmp_class_name == class_name: + bbox = left + " " + top + " " + right + " " +bottom + bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox}) + + bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True) + with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + + sum_AP = 0.0 + ap_dictionary = {} + lamr_dictionary = {} + with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file: + results_file.write("# AP and precision/recall per class\n") + count_true_positives = {} + + for class_index, class_name in enumerate(gt_classes): + count_true_positives[class_name] = 0 + dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json" + dr_data = json.load(open(dr_file)) + + nd = len(dr_data) + tp = [0] * nd + fp = [0] * nd + score = [0] * nd + score05_idx = 0 + for idx, detection in enumerate(dr_data): + file_id = detection["file_id"] + score[idx] = float(detection["confidence"]) + if score[idx] > 0.5: + score05_idx = idx + + if show_animation: + ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*") + if len(ground_truth_img) == 0: + error("Error. Image not found with id: " + file_id) + elif len(ground_truth_img) > 1: + error("Error. Multiple image with id: " + file_id) + else: + img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0]) + img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0] + if os.path.isfile(img_cumulative_path): + img_cumulative = cv2.imread(img_cumulative_path) + else: + img_cumulative = img.copy() + bottom_border = 60 + BLACK = [0, 0, 0] + img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK) + + gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" + ground_truth_data = json.load(open(gt_file)) + ovmax = -1 + gt_match = -1 + bb = [float(x) for x in detection["bbox"].split()] + for obj in ground_truth_data: + if obj["class_name"] == class_name: + bbgt = [ float(x) for x in obj["bbox"].split() ] + bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] + iw = bi[2] - bi[0] + 1 + ih = bi[3] - bi[1] + 1 + if iw > 0 and ih > 0: + ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] + + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih + ov = iw * ih / ua + if ov > ovmax: + ovmax = ov + gt_match = obj + + if show_animation: + status = "NO MATCH FOUND!" + + min_overlap = MINOVERLAP + if ovmax >= min_overlap: + if "difficult" not in gt_match: + if not bool(gt_match["used"]): + tp[idx] = 1 + gt_match["used"] = True + count_true_positives[class_name] += 1 + with open(gt_file, 'w') as f: + f.write(json.dumps(ground_truth_data)) + if show_animation: + status = "MATCH!" + else: + fp[idx] = 1 + if show_animation: + status = "REPEATED MATCH!" 
+ else: + fp[idx] = 1 + if ovmax > 0: + status = "INSUFFICIENT OVERLAP" + + """ + Draw image to show animation + """ + if show_animation: + height, widht = img.shape[:2] + white = (255,255,255) + light_blue = (255,200,100) + green = (0,255,0) + light_red = (30,30,255) + margin = 10 + # 1nd line + v_pos = int(height - margin - (bottom_border / 2.0)) + text = "Image: " + ground_truth_img[0] + " " + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width) + if ovmax != -1: + color = light_red + if status == "INSUFFICIENT OVERLAP": + text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100) + else: + text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100) + color = green + img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + # 2nd line + v_pos += int(bottom_border / 2.0) + rank_pos = str(idx+1) + text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100) + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + color = light_red + if status == "MATCH!": + color = green + text = "Result: " + status + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + + font = cv2.FONT_HERSHEY_SIMPLEX + if ovmax > 0: + bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ] + cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA) + bb = [int(i) for i in bb] + cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA) + + cv2.imshow("Animation", img) + cv2.waitKey(20) + output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg" + cv2.imwrite(output_img_path, img) + cv2.imwrite(img_cumulative_path, img_cumulative) + + cumsum = 0 + for idx, val in enumerate(fp): + fp[idx] += cumsum + cumsum += val + + cumsum = 0 + for idx, val in enumerate(tp): + tp[idx] += cumsum + cumsum += val + + rec = tp[:] + for idx, val in enumerate(tp): + rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1) + + prec = tp[:] + for idx, val in enumerate(tp): + prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1) + + ap, mrec, mprec = voc_ap(rec[:], prec[:]) + F1 = np.array(rec)*np.array(prec)*2 / np.where((np.array(prec)+np.array(rec))==0, 1, (np.array(prec)+np.array(rec))) + + sum_AP += ap + text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) + + if len(prec)>0: + F1_text = "{0:.2f}".format(F1[score05_idx]) + " = " + class_name + " F1 " + Recall_text = "{0:.2f}%".format(rec[score05_idx]*100) + " = " + class_name + " Recall " + Precision_text = "{0:.2f}%".format(prec[score05_idx]*100) + " = " + class_name + " Precision " + else: + F1_text = "0.00" + " = " + class_name + " F1 " + Recall_text = "0.00%" + " = " + class_name + " Recall " + Precision_text = "0.00%" + " = " + class_name + " Precision " + + rounded_prec = [ 
'%.2f' % elem for elem in prec ] + rounded_rec = [ '%.2f' % elem for elem in rec ] + results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") + if len(prec)>0: + print(text + "\t||\tscore_threhold=0.5 : " + "F1=" + "{0:.2f}".format(F1[score05_idx])\ + + " ; Recall=" + "{0:.2f}%".format(rec[score05_idx]*100) + " ; Precision=" + "{0:.2f}%".format(prec[score05_idx]*100)) + else: + print(text + "\t||\tscore_threhold=0.5 : F1=0.00% ; Recall=0.00% ; Precision=0.00%") + ap_dictionary[class_name] = ap + + n_images = counter_images_per_class[class_name] + lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images) + lamr_dictionary[class_name] = lamr + + if draw_plot: + plt.plot(rec, prec, '-o') + area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]] + area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]] + plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r') + + fig = plt.gcf() + fig.canvas.set_window_title('AP ' + class_name) + + plt.title('class: ' + text) + plt.xlabel('Recall') + plt.ylabel('Precision') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png") + plt.cla() + + plt.plot(score, F1, "-", color='orangered') + plt.title('class: ' + F1_text + "\nscore_threhold=0.5") + plt.xlabel('Score_Threhold') + plt.ylabel('F1') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png") + plt.cla() + + plt.plot(score, rec, "-H", color='gold') + plt.title('class: ' + Recall_text + "\nscore_threhold=0.5") + plt.xlabel('Score_Threhold') + plt.ylabel('Recall') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png") + plt.cla() + + plt.plot(score, prec, "-s", color='palevioletred') + plt.title('class: ' + Precision_text + "\nscore_threhold=0.5") + plt.xlabel('Score_Threhold') + plt.ylabel('Precision') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png") + plt.cla() + + if show_animation: + cv2.destroyAllWindows() + + results_file.write("\n# mAP of all classes\n") + mAP = sum_AP / n_classes + text = "mAP = {0:.2f}%".format(mAP*100) + results_file.write(text + "\n") + print(text) + + shutil.rmtree(TEMP_FILES_PATH) + + """ + Count total of detection-results + """ + det_counter_per_class = {} + for txt_file in dr_files_list: + lines_list = file_lines_to_list(txt_file) + for line in lines_list: + class_name = line.split()[0] + if class_name in det_counter_per_class: + det_counter_per_class[class_name] += 1 + else: + det_counter_per_class[class_name] = 1 + dr_classes = list(det_counter_per_class.keys()) + + """ + Write number of ground-truth objects per class to results.txt + """ + with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file: + results_file.write("\n# Number of ground-truth objects per class\n") + for class_name in sorted(gt_counter_per_class): + results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n") + + """ + Finish counting true positives + """ + for class_name in dr_classes: + if class_name not in gt_classes: + count_true_positives[class_name] = 0 + + """ + Write number of detected objects per class to results.txt + """ + with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file: + results_file.write("\n# 
Number of detected objects per class\n") + for class_name in sorted(dr_classes): + n_det = det_counter_per_class[class_name] + text = class_name + ": " + str(n_det) + text += " (tp:" + str(count_true_positives[class_name]) + "" + text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n" + results_file.write(text) + + """ + Plot the total number of occurences of each class in the ground-truth + """ + if draw_plot: + window_title = "ground-truth-info" + plot_title = "ground-truth\n" + plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)" + x_label = "Number of objects per class" + output_path = RESULTS_FILES_PATH + "/ground-truth-info.png" + to_show = False + plot_color = 'forestgreen' + draw_plot_func( + gt_counter_per_class, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + '', + ) + + """ + Plot the total number of occurences of each class in the "detection-results" folder + """ + if draw_plot: + window_title = "detection-results-info" + # Plot title + plot_title = "detection-results\n" + plot_title += "(" + str(len(dr_files_list)) + " files and " + count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values())) + plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)" + # end Plot title + x_label = "Number of objects per class" + output_path = RESULTS_FILES_PATH + "/detection-results-info.png" + to_show = False + plot_color = 'forestgreen' + true_p_bar = count_true_positives + draw_plot_func( + det_counter_per_class, + len(det_counter_per_class), + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + true_p_bar + ) + + """ + Draw log-average miss rate plot (Show lamr of all classes in decreasing order) + """ + if draw_plot: + window_title = "lamr" + plot_title = "log-average miss rate" + x_label = "log-average miss rate" + output_path = RESULTS_FILES_PATH + "/lamr.png" + to_show = False + plot_color = 'royalblue' + draw_plot_func( + lamr_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + + """ + Draw mAP plot (Show AP's of all classes in decreasing order) + """ + if draw_plot: + window_title = "mAP" + plot_title = "mAP = {0:.2f}%".format(mAP*100) + x_label = "Average Precision" + output_path = RESULTS_FILES_PATH + "/mAP.png" + to_show = True + plot_color = 'royalblue' + draw_plot_func( + ap_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + +def preprocess_gt(gt_path, class_names): + image_ids = os.listdir(gt_path) + results = {} + + images = [] + bboxes = [] + for i, image_id in enumerate(image_ids): + lines_list = file_lines_to_list(os.path.join(gt_path, image_id)) + boxes_per_image = [] + image = {} + image_id = os.path.splitext(image_id)[0] + image['file_name'] = image_id + '.jpg' + image['width'] = 1 + image['height'] = 1 + image['id'] = i + + for line in lines_list: + difficult = 0 + if "difficult" in line: + line_split = line.split() + left, top, right, bottom, _difficult = line_split[-5:] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + difficult = 1 + else: + line_split = line.split() + left, top, right, bottom = line_split[-4:] + class_name = "" + for name in line_split[:-4]: + class_name += name + " " + class_name = class_name[:-1] + + left, top, right, bottom = float(left), float(top), 
float(right), float(bottom) + cls_id = class_names.index(class_name) + 1 + bbox = [left, top, right - left, bottom - top, difficult, int(image_id), cls_id, (right - left) * (bottom - top) - 10.0] + boxes_per_image.append(bbox) + images.append(image) + bboxes.extend(boxes_per_image) + results['images'] = images + + categories = [] + for i, cls in enumerate(class_names): + category = {} + category['supercategory'] = cls + category['name'] = cls + category['id'] = i + categories.append(category) + results['categories'] = categories + + annotations = [] + for i, box in enumerate(bboxes): + annotation = {} + annotation['area'] = box[-1] + annotation['category_id'] = box[-2] + annotation['image_id'] = box[-3] + annotation['iscrowd'] = box[-4] + annotation['bbox'] = box[:4] + annotation['id'] = i + annotations.append(annotation) + results['annotations'] = annotations + return results + +def preprocess_dr(dr_path, class_names): + image_ids = os.listdir(dr_path) + results = [] + for image_id in image_ids: + lines_list = file_lines_to_list(os.path.join(dr_path, image_id)) + image_id = os.path.splitext(image_id)[0] + for line in lines_list: + line_split = line.split() + confidence, left, top, right, bottom = line_split[-5:] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + left, top, right, bottom = float(left), float(top), float(right), float(bottom) + result = {} + result["image_id"] = int(image_id) + result["category_id"] = class_names.index(class_name) + 1 + result["bbox"] = [left, top, right - left, bottom - top] + result["score"] = float(confidence) + results.append(result) + return results + +def get_coco_map(class_names, path): + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + GT_PATH = os.path.join(path, 'ground-truth') + DR_PATH = os.path.join(path, 'detection-results') + COCO_PATH = os.path.join(path, 'coco_eval') + + if not os.path.exists(COCO_PATH): + os.makedirs(COCO_PATH) + + GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json') + DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json') + + with open(GT_JSON_PATH, "w") as f: + results_gt = preprocess_gt(GT_PATH, class_names) + json.dump(results_gt, f, indent=4) + + with open(DR_JSON_PATH, "w") as f: + results_dr = preprocess_dr(DR_PATH, class_names) + json.dump(results_dr, f, indent=4) + + cocoGt = COCO(GT_JSON_PATH) + cocoDt = cocoGt.loadRes(DR_JSON_PATH) + cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() diff --git a/voc_annotation.py b/voc_annotation.py index 324ef31547a18b6e129d87eac1aee92825b6df72..25efc8177c9f6ed848e9e73407960f48b90d4967 100644 --- a/voc_annotation.py +++ b/voc_annotation.py @@ -1,19 +1,42 @@ -#---------------------------------------------# -# 运行前一定要修改classes +import os +import random +import xml.etree.ElementTree as ET + +from utils.utils import get_classes + +#--------------------------------------------------------------------------------------------------------------------------------# +# annotation_mode用于指定该文件运行时计算的内容 +# annotation_mode为0代表整个标签处理过程,包括获得VOCdevkit/VOC2007/ImageSets里面的txt以及训练用的2007_train.txt、2007_val.txt +# annotation_mode为1代表获得VOCdevkit/VOC2007/ImageSets里面的txt +# annotation_mode为2代表获得训练用的2007_train.txt、2007_val.txt +#--------------------------------------------------------------------------------------------------------------------------------# +annotation_mode = 0 
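#-------------------------------------------------------------------#
#   As a sketch (variable names assumed, not repo code): annotation_mode
#   gates the two stages below like so, using the value set just above.
#-------------------------------------------------------------------#
run_imagesets   = annotation_mode in (0, 1)   # stage 1: ImageSets/Main txt files
run_train_lists = annotation_mode in (0, 2)   # stage 2: 2007_train.txt / 2007_val.txt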
+#-------------------------------------------------------------------# +# 必须要修改,用于生成2007_train.txt、2007_val.txt的目标信息 +# 与训练和预测所用的classes_path一致即可 # 如果生成的2007_train.txt里面没有目标信息 # 那么就是因为classes没有设定正确 -#---------------------------------------------# -import xml.etree.ElementTree as ET -from os import getcwd +# 仅在annotation_mode为0和2的时候有效 +#-------------------------------------------------------------------# +classes_path = 'model_data/voc_classes.txt' +#--------------------------------------------------------------------------------------------------------------------------------# +# trainval_percent用于指定(训练集+验证集)与测试集的比例,默认情况下 (训练集+验证集):测试集 = 9:1 +# train_percent用于指定(训练集+验证集)中训练集与验证集的比例,默认情况下 训练集:验证集 = 9:1 +# 仅在annotation_mode为0和1的时候有效 +#--------------------------------------------------------------------------------------------------------------------------------# +trainval_percent = 0.9 +train_percent = 0.9 +#-------------------------------------------------------# +# 指向VOC数据集所在的文件夹 +# 默认指向根目录下的VOC数据集 +#-------------------------------------------------------# +VOCdevkit_path = 'VOCdevkit' -sets=[('2007', 'train'), ('2007', 'val'), ('2007', 'test')] -#-----------------------------------------------------# -# 这里设定的classes顺序要和model_data里的txt一样 -#-----------------------------------------------------# -classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] +VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')] +classes, _ = get_classes(classes_path) def convert_annotation(year, image_id, list_file): - in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id), encoding='utf-8') + in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml'%(year, image_id)), encoding='utf-8') tree=ET.parse(in_file) root = tree.getroot() @@ -28,14 +51,59 @@ def convert_annotation(year, image_id, list_file): xmlbox = obj.find('bndbox') b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text))) list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id)) + +if __name__ == "__main__": + random.seed(0) + if annotation_mode == 0 or annotation_mode == 1: + print("Generate txt in ImageSets.") + xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations') + saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main') + temp_xml = os.listdir(xmlfilepath) + total_xml = [] + for xml in temp_xml: + if xml.endswith(".xml"): + total_xml.append(xml) + + num = len(total_xml) + list = range(num) + tv = int(num*trainval_percent) + tr = int(tv*train_percent) + trainval= random.sample(list,tv) + train = random.sample(trainval,tr) + + print("train and val size",tv) + print("train size",tr) + ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w') + ftest = open(os.path.join(saveBasePath,'test.txt'), 'w') + ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w') + fval = open(os.path.join(saveBasePath,'val.txt'), 'w') + + for i in list: + name=total_xml[i][:-4]+'\n' + if i in trainval: + ftrainval.write(name) + if i in train: + ftrain.write(name) + else: + fval.write(name) + else: + ftest.write(name) + + ftrainval.close() + ftrain.close() + fval.close() + ftest.close() + print("Generate txt in ImageSets done.") -wd = getcwd() + if annotation_mode == 0 or annotation_mode == 2: + print("Generate 2007_train.txt and 2007_val.txt for 
diff --git a/yolo.py b/yolo.py
index 4b513dde39339b201d0af4d2a308c3756a43e49e..e29d2dd81f5531e829e6cb831b88a6022bd4572f 100644
--- a/yolo.py
+++ b/yolo.py
@@ -1,6 +1,3 @@
-#-------------------------------------#
-#   Create the YOLO class
-#-------------------------------------#
 import colorsys
 import os
 import time
@@ -8,33 +5,53 @@ import time
 import numpy as np
 import torch
 import torch.nn as nn
-from PIL import Image, ImageDraw, ImageFont
+from PIL import ImageDraw, ImageFont
 
-from nets.yolo4 import YoloBody
-from utils.utils import (DecodeBox, letterbox_image, non_max_suppression,
-                         yolo_correct_boxes)
+from nets.yolo import YoloBody
+from utils.utils import (cvtColor, get_anchors, get_classes, preprocess_input,
+                         resize_image)
+from utils.utils_bbox import DecodeBox
 
-
-#--------------------------------------------#
-#   To predict with your own trained model you need to modify two
-#   parameters: both model_path and classes_path must be changed!
-#   If a shape mismatch occurs, be sure to check the model_path
-#   and classes_path parameters used during training.
-#--------------------------------------------#
+'''
+Comments you must read before training on your own dataset!
+'''
 class YOLO(object):
     _defaults = {
+        #--------------------------------------------------------------------------#
+        #   To predict with your own trained model you must modify model_path and classes_path!
+        #   model_path points to the weight file under the logs folder; classes_path points to the txt under model_data.
+        #   If a shape mismatch occurs, also check that the model_path and classes_path used during training were set accordingly.
+        #--------------------------------------------------------------------------#
         "model_path"        : 'model_data/yolo4_weights.pth',
-        "anchors_path"      : 'model_data/yolo_anchors.txt',
         "classes_path"      : 'model_data/coco_classes.txt',
-        "model_image_size"  : (416, 416, 3),
+        #---------------------------------------------------------------------#
+        #   anchors_path is the txt file of the anchor boxes; it rarely needs changing.
+        #   anchors_mask helps the code find the corresponding anchors; it rarely needs changing.
+        #---------------------------------------------------------------------#
+        "anchors_path"      : 'model_data/yolo_anchors.txt',
+        "anchors_mask"      : [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
+        #---------------------------------------------------------------------#
+        #   The input image size; must be a multiple of 32.
+        #---------------------------------------------------------------------#
+        "input_shape"       : [416, 416],
+        #---------------------------------------------------------------------#
+        #   Only prediction boxes whose score exceeds this confidence are kept.
+        #---------------------------------------------------------------------#
         "confidence"        : 0.5,
-        "iou"               : 0.3,
-        "cuda"              : True,
+        #---------------------------------------------------------------------#
+        #   The IoU threshold used by non-maximum suppression.
+        #---------------------------------------------------------------------#
+        "nms_iou"           : 0.3,
         #---------------------------------------------------------------------#
         #   Controls whether letterbox_image is used for a distortion-free
         #   resize of the input image. After repeated tests, resizing directly
         #   with letterbox_image disabled gave better results.
         #---------------------------------------------------------------------#
         "letterbox_image"   : False,
+        #-------------------------------#
+        #   Whether to use CUDA.
+        #   Set this to False if you have no GPU.
+        #-------------------------------#
+        "cuda"              : True,
     }
 
     @classmethod
@@ -49,165 +66,106 @@ class YOLO(object):
     #---------------------------------------------------#
     def __init__(self, **kwargs):
         self.__dict__.update(self._defaults)
-        self.class_names = self._get_class()
-        self.anchors = self._get_anchors()
-        self.generate()
+        for name, value in kwargs.items():
+            setattr(self, name, value)
+
+        #---------------------------------------------------#
+        #   Get the classes and the anchor boxes.
+        #---------------------------------------------------#
+        self.class_names, self.num_classes  = get_classes(self.classes_path)
+        self.anchors, self.num_anchors      = get_anchors(self.anchors_path)
+        self.bbox_util                      = DecodeBox(self.anchors, self.num_classes, (self.input_shape[0], self.input_shape[1]), self.anchors_mask)
 
-    #---------------------------------------------------#
-    #   Get all the classes
-    #---------------------------------------------------#
-    def _get_class(self):
-        classes_path = os.path.expanduser(self.classes_path)
-        with open(classes_path) as f:
-            class_names = f.readlines()
-        class_names = [c.strip() for c in class_names]
-        return class_names
-
-    #---------------------------------------------------#
-    #   Get all the anchor boxes
-    #---------------------------------------------------#
-    def _get_anchors(self):
-        anchors_path = os.path.expanduser(self.anchors_path)
-        with open(anchors_path) as f:
-            anchors = f.readline()
-        anchors = [float(x) for x in anchors.split(',')]
-        return np.array(anchors).reshape([-1, 3, 2])[::-1,:,:]
+        #---------------------------------------------------#
+        #   Assign a different color to each class for drawing boxes.
+        #---------------------------------------------------#
+        hsv_tuples  = [(x / self.num_classes, 1., 1.) for x in range(self.num_classes)]
+        self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
+        self.colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), self.colors))
+        self.generate()
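Because __init__ now folds keyword arguments over _defaults with setattr, thresholds can be overridden per instance without editing the class. A minimal usage sketch, assuming the weights named in model_path exist on disk (construction triggers generate() and loads them):

# Hypothetical usage: per-instance overrides leave the class defaults untouched.
yolo = YOLO(confidence=0.3, nms_iou=0.45, cuda=False)
print(yolo.confidence, YOLO._defaults["confidence"])   # 0.3 0.5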
 
     #---------------------------------------------------#
     #   Build the model
     #---------------------------------------------------#
     def generate(self):
         #---------------------------------------------------#
-        #   Build the yolov4 model
+        #   Build the YOLO model and load its weights.
         #---------------------------------------------------#
-        self.net = YoloBody(len(self.anchors[0]), len(self.class_names)).eval()
+        self.net    = YoloBody(self.anchors_mask, self.num_classes)
+        device      = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.net.load_state_dict(torch.load(self.model_path, map_location=device))
+        self.net    = self.net.eval()
+        print('{} model, anchors, and classes loaded.'.format(self.model_path))
 
-        #---------------------------------------------------#
-        #   Load the yolov4 model weights
-        #---------------------------------------------------#
-        print('Loading weights into state dict...')
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        state_dict = torch.load(self.model_path, map_location=device)
-        self.net.load_state_dict(state_dict)
-        print('Finished!')
-
         if self.cuda:
             self.net = nn.DataParallel(self.net)
             self.net = self.net.cuda()
 
-        #---------------------------------------------------#
-        #   Build the decoding tools for the three feature layers.
-        #---------------------------------------------------#
-        self.yolo_decodes = []
-        for i in range(3):
-            self.yolo_decodes.append(DecodeBox(self.anchors[i], len(self.class_names), (self.model_image_size[1], self.model_image_size[0])))
-
-        print('{} model, anchors, and classes loaded.'.format(self.model_path))
-        # Assign a different color to each class for drawing boxes.
-        hsv_tuples = [(x / len(self.class_names), 1., 1.)
-                      for x in range(len(self.class_names))]
-        self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
-        self.colors = list(
-            map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
-                self.colors))
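A back-of-the-envelope check (not part of the diff) of why a wrong classes_path surfaces as a shape mismatch when load_state_dict runs above: each YOLO detection head predicts 3 anchors times (num_classes + 5) channels, so the head width is tied to the class count.

# Expected final-conv channel count per head for two common class counts.
num_anchors_per_head = 3
for num_classes in (80, 20):     # COCO vs. VOC class counts
    print(num_classes, '->', num_anchors_per_head * (num_classes + 5))
# 80 -> 255
# 20 -> 75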
     #---------------------------------------------------#
     #   Detect an image
     #---------------------------------------------------#
     def detect_image(self, image):
+        #---------------------------------------------------#
+        #   Compute the height and width of the input image.
+        #---------------------------------------------------#
+        image_shape = np.array(np.shape(image)[0:2])
         #---------------------------------------------------------#
         #   Convert the image to RGB here to prevent grayscale images from raising errors during prediction.
+        #   The code only supports prediction on RGB images; all other image types are converted to RGB.
         #---------------------------------------------------------#
-        image = image.convert('RGB')
-
-        image_shape = np.array(np.shape(image)[0:2])
+        image       = cvtColor(image)
         #---------------------------------------------------------#
         #   Add gray bars to the image for a distortion-free resize.
         #   Alternatively, resize directly for detection.
         #---------------------------------------------------------#
-        if self.letterbox_image:
-            crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
-        else:
-            crop_img = image.resize((self.model_image_size[1],self.model_image_size[0]), Image.BICUBIC)
-        photo = np.array(crop_img,dtype = np.float32) / 255.0
-        photo = np.transpose(photo, (2, 0, 1))
+        image_data  = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
         #---------------------------------------------------------#
         #   Add the batch_size dimension.
         #---------------------------------------------------------#
-        images = [photo]
+        image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
 
         with torch.no_grad():
-            images = torch.from_numpy(np.asarray(images))
+            images = torch.from_numpy(image_data)
             if self.cuda:
                 images = images.cuda()
-
             #---------------------------------------------------------#
             #   Feed the image into the network for prediction!
             #---------------------------------------------------------#
             outputs = self.net(images)
-            output_list = []
-            for i in range(3):
-                output_list.append(self.yolo_decodes[i](outputs[i]))
-
+            outputs = self.bbox_util.decode_box(outputs)
             #---------------------------------------------------------#
             #   Stack the prediction boxes, then run non-maximum suppression.
             #---------------------------------------------------------#
-            output = torch.cat(output_list, 1)
-            batch_detections = non_max_suppression(output, len(self.class_names),
-                                                    conf_thres=self.confidence,
-                                                    nms_thres=self.iou)
-
-            #---------------------------------------------------------#
-            #   If no objects were detected, return the original image.
-            #---------------------------------------------------------#
-            try:
-                batch_detections = batch_detections[0].cpu().numpy()
-            except:
+            results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
+                        image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
+
+            if results[0] is None:
                 return image
-
-            #---------------------------------------------------------#
-            #   Filter the prediction boxes by score.
-            #---------------------------------------------------------#
-            top_index = batch_detections[:,4] * batch_detections[:,5] > self.confidence
-            top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
-            top_label = np.array(batch_detections[top_index,-1],np.int32)
-            top_bboxes = np.array(batch_detections[top_index,:4])
-            top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
-            #-----------------------------------------------------------------#
-            #   Before the image is fed into the network, letterbox_image adds
-            #   gray bars around it, so the resulting top_bboxes are relative
-            #   to the padded image and must be corrected to remove the bars.
-            #-----------------------------------------------------------------#
-            if self.letterbox_image:
-                boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
-            else:
-                top_xmin = top_xmin / self.model_image_size[1] * image_shape[1]
-                top_ymin = top_ymin / self.model_image_size[0] * image_shape[0]
-                top_xmax = top_xmax / self.model_image_size[1] * image_shape[1]
-                top_ymax = top_ymax / self.model_image_size[0] * image_shape[0]
-                boxes = np.concatenate([top_ymin,top_xmin,top_ymax,top_xmax], axis=-1)
-
-            font = ImageFont.truetype(font='model_data/simhei.ttf',size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))
-
-            thickness = max((np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0], 1)
-
-            for i, c in enumerate(top_label):
-                predicted_class = self.class_names[c]
-                score = top_conf[i]
+            top_label   = np.array(results[0][:, 6], dtype = 'int32')
+            top_conf    = results[0][:, 4] * results[0][:, 5]
+            top_boxes   = results[0][:, :4]
+        #---------------------------------------------------------#
+        #   Set the font and the border thickness.
+        #---------------------------------------------------------#
+        font        = ImageFont.truetype(font='model_data/simhei.ttf', size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
+        thickness   = int(max((image.size[0] + image.size[1]) // np.mean(self.input_shape), 1))
+
+        #---------------------------------------------------------#
+        #   Draw the results on the image.
+        #---------------------------------------------------------#
+        for i, c in list(enumerate(top_label)):
+            predicted_class = self.class_names[int(c)]
+            box             = top_boxes[i]
+            score           = top_conf[i]
 
-            top, left, bottom, right = boxes[i]
-            top = top - 5
-            left = left - 5
-            bottom = bottom + 5
-            right = right + 5
+            top, left, bottom, right = box
 
-            top = max(0, np.floor(top + 0.5).astype('int32'))
-            left = max(0, np.floor(left + 0.5).astype('int32'))
-            bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32'))
-            right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32'))
+            top     = max(0, np.floor(top).astype('int32'))
+            left    = max(0, np.floor(left).astype('int32'))
+            bottom  = min(image.size[1], np.floor(bottom).astype('int32'))
+            right   = min(image.size[0], np.floor(right).astype('int32'))
 
-            # Draw the boxes
             label = '{} {:.2f}'.format(predicted_class, score)
             draw = ImageDraw.Draw(image)
             label_size = draw.textsize(label, font)
@@ -220,100 +178,113 @@ class YOLO(object):
                 text_origin = np.array([left, top + 1])
 
             for i in range(thickness):
-                draw.rectangle(
-                    [left + i, top + i, right - i, bottom - i],
-                    outline=self.colors[self.class_names.index(predicted_class)])
-            draw.rectangle(
-                [tuple(text_origin), tuple(text_origin + label_size)],
-                fill=self.colors[self.class_names.index(predicted_class)])
+                draw.rectangle([left + i, top + i, right - i, bottom - i], outline=self.colors[c])
+            draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[c])
             draw.text(text_origin, str(label,'UTF-8'), fill=(0, 0, 0), font=font)
             del draw
+
         return image
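A minimal driver for detect_image, in the spirit of a predict script; the image path is a placeholder and the weights named in _defaults are assumed to be present:

# Hypothetical usage sketch for detect_image.
from PIL import Image

yolo    = YOLO()
image   = Image.open('img/street.jpg')   # placeholder path
r_image = yolo.detect_image(image)       # returns a PIL image with boxes and labels drawn
r_image.show()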
 
     def get_FPS(self, image, test_interval):
-        # Adjust the image so it matches the input requirements.
         image_shape = np.array(np.shape(image)[0:2])
-
+        #---------------------------------------------------------#
+        #   Convert the image to RGB here to prevent grayscale images from raising errors during prediction.
+        #   The code only supports prediction on RGB images; all other image types are converted to RGB.
+        #---------------------------------------------------------#
+        image       = cvtColor(image)
         #---------------------------------------------------------#
         #   Add gray bars to the image for a distortion-free resize.
         #   Alternatively, resize directly for detection.
         #---------------------------------------------------------#
-        if self.letterbox_image:
-            crop_img = np.array(letterbox_image(image, (self.model_image_size[1],self.model_image_size[0])))
-        else:
-            crop_img = image.convert('RGB')
-            crop_img = crop_img.resize((self.model_image_size[1],self.model_image_size[0]), Image.BICUBIC)
-        photo = np.array(crop_img,dtype = np.float32) / 255.0
-        photo = np.transpose(photo, (2, 0, 1))
+        image_data  = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
         #---------------------------------------------------------#
         #   Add the batch_size dimension.
         #---------------------------------------------------------#
-        images = [photo]
+        image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
 
         with torch.no_grad():
-            images = torch.from_numpy(np.asarray(images))
+            images = torch.from_numpy(image_data)
             if self.cuda:
                 images = images.cuda()
+            #---------------------------------------------------------#
+            #   Feed the image into the network for prediction!
+            #---------------------------------------------------------#
             outputs = self.net(images)
-            output_list = []
-            for i in range(3):
-                output_list.append(self.yolo_decodes[i](outputs[i]))
-            output = torch.cat(output_list, 1)
-            batch_detections = non_max_suppression(output, len(self.class_names),
-                                                    conf_thres=self.confidence,
-                                                    nms_thres=self.iou)
-            try:
-                batch_detections = batch_detections[0].cpu().numpy()
-                top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
-                top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
-                top_label = np.array(batch_detections[top_index,-1],np.int32)
-                top_bboxes = np.array(batch_detections[top_index,:4])
-                top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
-
-                if self.letterbox_image:
-                    boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
-                else:
-                    top_xmin = top_xmin / self.model_image_size[1] * image_shape[1]
-                    top_ymin = top_ymin / self.model_image_size[0] * image_shape[0]
-                    top_xmax = top_xmax / self.model_image_size[1] * image_shape[1]
-                    top_ymax = top_ymax / self.model_image_size[0] * image_shape[0]
-                    boxes = np.concatenate([top_ymin,top_xmin,top_ymax,top_xmax], axis=-1)
-
-            except:
-                pass
-
+            outputs = self.bbox_util.decode_box(outputs)
+            #---------------------------------------------------------#
+            #   Stack the prediction boxes, then run non-maximum suppression.
+            #---------------------------------------------------------#
+            results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
+                        image_shape, self.letterbox_image, conf_thres=self.confidence, nms_thres=self.nms_iou)
+
         t1 = time.time()
         for _ in range(test_interval):
            with torch.no_grad():
+                #---------------------------------------------------------#
+                #   Feed the image into the network for prediction!
+                #---------------------------------------------------------#
                 outputs = self.net(images)
-                output_list = []
-                for i in range(3):
-                    output_list.append(self.yolo_decodes[i](outputs[i]))
-                output = torch.cat(output_list, 1)
-                batch_detections = non_max_suppression(output, len(self.class_names),
-                                                        conf_thres=self.confidence,
-                                                        nms_thres=self.iou)
-                try:
-                    batch_detections = batch_detections[0].cpu().numpy()
-                    top_index = batch_detections[:,4]*batch_detections[:,5] > self.confidence
-                    top_conf = batch_detections[top_index,4]*batch_detections[top_index,5]
-                    top_label = np.array(batch_detections[top_index,-1],np.int32)
-                    top_bboxes = np.array(batch_detections[top_index,:4])
-                    top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(top_bboxes[:,0],-1),np.expand_dims(top_bboxes[:,1],-1),np.expand_dims(top_bboxes[:,2],-1),np.expand_dims(top_bboxes[:,3],-1)
-
-                    if self.letterbox_image:
-                        boxes = yolo_correct_boxes(top_ymin,top_xmin,top_ymax,top_xmax,np.array([self.model_image_size[0],self.model_image_size[1]]),image_shape)
-                    else:
-                        top_xmin = top_xmin / self.model_image_size[1] * image_shape[1]
-                        top_ymin = top_ymin / self.model_image_size[0] * image_shape[0]
-                        top_xmax = top_xmax / self.model_image_size[1] * image_shape[1]
-                        top_ymax = top_ymax / self.model_image_size[0] * image_shape[0]
-                        boxes = np.concatenate([top_ymin,top_xmin,top_ymax,top_xmax], axis=-1)
-
-                except:
-                    pass
-
+                outputs = self.bbox_util.decode_box(outputs)
+                #---------------------------------------------------------#
+                #   Stack the prediction boxes, then run non-maximum suppression.
+                #---------------------------------------------------------#
+                results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
+                            image_shape, self.letterbox_image, conf_thres=self.confidence, nms_thres=self.nms_iou)
+
         t2 = time.time()
         tact_time = (t2 - t1) / test_interval
         return tact_time
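get_FPS runs one warm-up pass, then times test_interval forward passes and returns the mean seconds per image, so FPS is its reciprocal. A usage sketch with a placeholder image path:

# Hypothetical timing run; assumes weights and the image exist locally.
from PIL import Image

yolo = YOLO()
img  = Image.open('img/street.jpg')               # placeholder path
tact_time = yolo.get_FPS(img, test_interval=100)  # mean seconds per image
print('%.4f s per image, %.2f FPS' % (tact_time, 1.0 / tact_time))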
+
+    def get_map_txt(self, image_id, image, class_names, map_out_path):
+        f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w")
+        image_shape = np.array(np.shape(image)[0:2])
+        #---------------------------------------------------------#
+        #   Convert the image to RGB here to prevent grayscale images from raising errors during prediction.
+        #   The code only supports prediction on RGB images; all other image types are converted to RGB.
+        #---------------------------------------------------------#
+        image       = cvtColor(image)
+        #---------------------------------------------------------#
+        #   Add gray bars to the image for a distortion-free resize.
+        #   Alternatively, resize directly for detection.
+        #---------------------------------------------------------#
+        image_data  = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
+        #---------------------------------------------------------#
+        #   Add the batch_size dimension.
+        #---------------------------------------------------------#
+        image_data  = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
+
+        with torch.no_grad():
+            images = torch.from_numpy(image_data)
+            if self.cuda:
+                images = images.cuda()
+            #---------------------------------------------------------#
+            #   Feed the image into the network for prediction!
+            #---------------------------------------------------------#
+            outputs = self.net(images)
+            outputs = self.bbox_util.decode_box(outputs)
+            #---------------------------------------------------------#
+            #   Stack the prediction boxes, then run non-maximum suppression.
+            #---------------------------------------------------------#
+            results = self.bbox_util.non_max_suppression(torch.cat(outputs, 1), self.num_classes, self.input_shape,
+                        image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
+
+        if results[0] is None:
+            return
+
+        top_label   = np.array(results[0][:, 6], dtype = 'int32')
+        top_conf    = results[0][:, 4] * results[0][:, 5]
+        top_boxes   = results[0][:, :4]
+
+        for i, c in list(enumerate(top_label)):
+            predicted_class = self.class_names[int(c)]
+            box             = top_boxes[i]
+            score           = str(top_conf[i])
+
+            top, left, bottom, right = box
+            if predicted_class not in class_names:
+                continue
+
+            f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
+
+        f.close()
+        return
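Each line that get_map_txt writes follows the "class score left top right bottom" layout that the mAP computation later reads from the detection-results folder. A small parsing sketch (the values below are illustrative only, not real output):

# One line as written by f.write above: "<class> <score> <left> <top> <right> <bottom>"
line = "car 0.9876 125 48 310 205"   # illustrative values
cls, score, left, top, right, bottom = line.split()
print(cls, float(score), int(left), int(top), int(right), int(bottom))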