Add files via upload

34d2e3e1 · Bubbliiiing · GitHub · 52358268 · 34d2e3e1 · 34d2e3e1
18 changed files
--- a/get_map.py
+++ b/get_map.py
--- a/kmeans_for_anchors.py
+++ b/kmeans_for_anchors.py
@@ -24,32 +24,45 @@ def avg_iou(box,cluster):
    return np.mean([np.max(cas_iou(box[i],cluster)) for i in range(box.shape[0])])

 def kmeans(box,k):
-    # 取出一共有多少框
+    #-------------------------------------------------------------#
+    #   取出一共有多少框
+    #-------------------------------------------------------------#
    row = box.shape[0]
    
-    # 每个框各个点的位置
+    #-------------------------------------------------------------#
+    #   每个框各个点的位置
+    #-------------------------------------------------------------#
    distance = np.empty((row,k))
    
-    # 最后的聚类位置
+    #-------------------------------------------------------------#
+    #   最后的聚类位置
+    #-------------------------------------------------------------#
    last_clu = np.zeros((row,))

    np.random.seed()

-    # 随机选5个当聚类中心
+    #-------------------------------------------------------------#
+    #   随机选5个当聚类中心
+    #-------------------------------------------------------------#
    cluster = box[np.random.choice(row,k,replace = False)]
-    # cluster = random.sample(row, k)
    while True:
-        # 计算每一行距离五个点的iou情况。
+        #-------------------------------------------------------------#
+        #   计算每一行距离五个点的iou情况。
+        #-------------------------------------------------------------#
        for i in range(row):
            distance[i] = 1 - cas_iou(box[i],cluster)
        
-        # 取出最小点
+        #-------------------------------------------------------------#
+        #   取出最小点
+        #-------------------------------------------------------------#
        near = np.argmin(distance,axis=1)

        if (last_clu == near).all():
            break
        
-        # 求每一个类的中位点
+        #-------------------------------------------------------------#
+        #   求每一个类的中位点
+        #-------------------------------------------------------------#
        for j in range(k):
            cluster[j] = np.median(
                box[near == j],axis=0)
@@ -60,7 +73,9 @@ def kmeans(box,k):

 def load_data(path):
    data = []
-    # 对于每一个xml都寻找box
+    #-------------------------------------------------------------#
+    #   对于每一个xml都寻找box
+    #-------------------------------------------------------------#
    for xml_file in glob.glob('{}/*xml'.format(path)):
        tree = ET.parse(xml_file)
        height = int(tree.findtext('./size/height'))
@@ -68,7 +83,9 @@ def load_data(path):
        if height<=0 or width<=0:
            continue
        
-        # 对于每一个目标都获得它的宽高
+        #-------------------------------------------------------------#
+        #   对于每一个目标都获得它的宽高
+        #-------------------------------------------------------------#
        for obj in tree.iter('object'):
            xmin = int(float(obj.findtext('bndbox/xmin'))) / width
            ymin = int(float(obj.findtext('bndbox/ymin'))) / height
@@ -85,18 +102,26 @@ def load_data(path):


 if __name__ == '__main__':
-    # 运行该程序会计算'./VOCdevkit/VOC2007/Annotations'的xml
-    # 会生成yolo_anchors.txt
-    SIZE = 416
+    #-------------------------------------------------------------#
+    #   运行该程序会计算'./VOCdevkit/VOC2007/Annotations'的xml
+    #   会生成yolo_anchors.txt
+    #-------------------------------------------------------------#
+    SIZE        = 416
    anchors_num = 9
-    # 载入数据集，可以使用VOC的xml
-    path = r'./VOCdevkit/VOC2007/Annotations'
+    #-------------------------------------------------------------#
+    #   载入数据集，可以使用VOC的xml
+    #-------------------------------------------------------------#
+    path        = r'./VOCdevkit/VOC2007/Annotations'
    
-    # 载入所有的xml
-    # 存储格式为转化为比例后的width,height
+    #-------------------------------------------------------------#
+    #   载入所有的xml
+    #   存储格式为转化为比例后的width,height
+    #-------------------------------------------------------------#
    data = load_data(path)
    
-    # 使用k聚类算法
+    #-------------------------------------------------------------#
+    #   使用k聚类算法
+    #-------------------------------------------------------------#
    out = kmeans(data,anchors_num)
    out = out[np.argsort(out[:,0])]
    print('acc:{:.2f}%'.format(avg_iou(data,out) * 100))

--- a/nets/CSPdarknet.py
+++ b/nets/CSPdarknet.py
 import math
+from collections import OrderedDict

 import torch
 import torch.nn as nn

--- a/nets/__init__.py
+++ b/nets/__init__.py
+#
\ No newline at end of file
--- a/nets/yolo.py
+++ b/nets/yolo.py
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+from nets.CSPdarknet import darknet53
+
+
+def conv2d(filter_in, filter_out, kernel_size, stride=1):
+    """Build a Conv2d -> BatchNorm2d -> LeakyReLU(0.1) block.
+
+    The convolution has no bias and is padded with (kernel_size - 1) // 2;
+    a falsy kernel_size gets a padding of 0. The three layers are registered
+    in an nn.Sequential under the names "conv", "bn" and "relu".
+    """
+    if kernel_size:
+        padding = (kernel_size - 1) // 2
+    else:
+        padding = 0
+
+    layers = OrderedDict()
+    layers["conv"] = nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
+    layers["bn"] = nn.BatchNorm2d(filter_out)
+    layers["relu"] = nn.LeakyReLU(0.1)
+    return nn.Sequential(layers)
+
+#---------------------------------------------------#
+#   SPP结构，利用不同大小的池化核进行池化
+#   池化后堆叠
+#---------------------------------------------------#
+class SpatialPyramidPooling(nn.Module):
+    """Spatial pyramid pooling: max-pool the input at several kernel sizes and stack the results.
+
+    Args:
+        pool_sizes: iterable of pooling kernel sizes. Each pool uses stride 1
+            and padding size // 2, so odd sizes keep the spatial size unchanged.
+
+    forward concatenates, along the channel dimension, the pooled maps (largest
+    listed size first, i.e. reverse of pool_sizes) followed by the input itself,
+    so the output has (len(pool_sizes) + 1) times the input channels.
+    """
+    # A tuple default avoids sharing one mutable list object between all calls.
+    def __init__(self, pool_sizes=(5, 9, 13)):
+        super(SpatialPyramidPooling, self).__init__()
+
+        self.maxpools = nn.ModuleList([nn.MaxPool2d(pool_size, 1, pool_size//2) for pool_size in pool_sizes])
+
+    def forward(self, x):
+        # Pools are applied in reverse registration order; the raw input goes last.
+        features = [maxpool(x) for maxpool in self.maxpools[::-1]]
+        features = torch.cat(features + [x], dim=1)
+
+        return features
+
+#---------------------------------------------------#
+#   卷积 + 上采样
+#---------------------------------------------------#
+class Upsample(nn.Module):
+    """Channel-changing 1x1 conv2d block followed by 2x nearest-neighbour upsampling."""
+    def __init__(self, in_channels, out_channels):
+        super(Upsample, self).__init__()
+
+        channel_conv = conv2d(in_channels, out_channels, 1)
+        enlarge      = nn.Upsample(scale_factor=2, mode='nearest')
+        # Registered as a two-step Sequential: index 0 is the conv block, index 1 the resize.
+        self.upsample = nn.Sequential(channel_conv, enlarge)
+
+    def forward(self, x,):
+        return self.upsample(x)
+
+#---------------------------------------------------#
+#   三次卷积块
+#---------------------------------------------------#
+def make_three_conv(filters_list, in_filters):
+    """Stack three conv2d blocks: 1x1 -> 3x3 -> 1x1.
+
+    filters_list[0] is the channel count produced by the 1x1 blocks (and the
+    final output); filters_list[1] is the channel count produced by the 3x3 block.
+    """
+    narrow, wide = filters_list[0], filters_list[1]
+    squeeze_in  = conv2d(in_filters, narrow, 1)
+    expand      = conv2d(narrow, wide, 3)
+    squeeze_out = conv2d(wide, narrow, 1)
+    return nn.Sequential(squeeze_in, expand, squeeze_out)
+
+#---------------------------------------------------#
+#   五次卷积块
+#---------------------------------------------------#
+def make_five_conv(filters_list, in_filters):
+    """Stack five conv2d blocks alternating 1x1 and 3x3 kernels (1, 3, 1, 3, 1).
+
+    The 1x1 blocks output filters_list[0] channels and the 3x3 blocks output
+    filters_list[1] channels, so the stack ends with filters_list[0] channels.
+    """
+    narrow, wide = filters_list[0], filters_list[1]
+    # (input channels, output channels, kernel size) for each block, in order
+    specs = [
+        (in_filters, narrow, 1),
+        (narrow, wide, 3),
+        (wide, narrow, 1),
+        (narrow, wide, 3),
+        (wide, narrow, 1),
+    ]
+    return nn.Sequential(*[conv2d(c_in, c_out, kernel) for c_in, c_out, kernel in specs])
+
+#---------------------------------------------------#
+#   最后获得yolov4的输出
+#---------------------------------------------------#
+def yolo_head(filters_list, in_filters):
+    """Build a prediction head: a 3x3 conv2d block followed by a plain 1x1 nn.Conv2d.
+
+    filters_list[0] is the width of the 3x3 block and filters_list[1] is the
+    number of output channels of the final (biased, un-normalised) convolution.
+    """
+    hidden, out_channels = filters_list[0], filters_list[1]
+    feature_conv    = conv2d(in_filters, hidden, 3)
+    prediction_conv = nn.Conv2d(hidden, out_channels, 1)
+    return nn.Sequential(feature_conv, prediction_conv)
+
+#---------------------------------------------------#
+#   yolo_body
+#---------------------------------------------------#
+class YoloBody(nn.Module):
+    """Detector body: darknet53 backbone, SPP block, top-down then bottom-up feature fusion, three heads.
+
+    Args:
+        anchors_mask: sequence of three anchor-index groups. Only the length of
+            each group is used here, to size the corresponding head output.
+        num_classes: number of classes; every anchor predicts 5 + num_classes values.
+
+    forward(x) returns (out0, out1, out2), produced from the deepest, middle and
+    shallowest backbone feature maps respectively. Shape notes in the comments
+    below are written as H,W,C and assume a 416x416 input.
+    """
+    def __init__(self, anchors_mask, num_classes):
+        super(YoloBody, self).__init__()
+        #---------------------------------------------------#   
+        #   Build the CSPdarknet53 backbone.
+        #   It yields three feature maps whose shapes are:
+        #   52,52,256
+        #   26,26,512
+        #   13,13,1024
+        #---------------------------------------------------#
+        self.backbone = darknet53(None)
+
+        # SPP concatenates 3 pooled maps plus its input: 512 * 4 = 2048 channels
+        self.conv1      = make_three_conv([512,1024],1024)
+        self.SPP        = SpatialPyramidPooling()
+        self.conv2      = make_three_conv([512,1024],2048)
+
+        self.upsample1          = Upsample(512,256)
+        self.conv_for_P4        = conv2d(512,256,1)
+        self.make_five_conv1    = make_five_conv([256, 512],512)
+
+        self.upsample2          = Upsample(256,128)
+        self.conv_for_P3        = conv2d(256,128,1)
+        self.make_five_conv2    = make_five_conv([128, 256],256)
+
+        # NOTE(review): yolo_head3 produces out2 but is sized from anchors_mask[0]
+        # (and yolo_head1 produces out0 but uses anchors_mask[2]). This is harmless
+        # while all groups have the same length; confirm against the loss code
+        # before using groups of different lengths.
+        # e.g. 3 anchors and 20 classes: 3*(5+20) = 3*(4+1+20) = 75
+        self.yolo_head3         = yolo_head([256, len(anchors_mask[0]) * (5 + num_classes)],128)
+
+        self.down_sample1       = conv2d(128,256,3,stride=2)
+        self.make_five_conv3    = make_five_conv([256, 512],512)
+
+        # e.g. 3 anchors and 20 classes: 3*(5+20) = 3*(4+1+20) = 75
+        self.yolo_head2         = yolo_head([512, len(anchors_mask[1]) * (5 + num_classes)],256)
+
+        self.down_sample2       = conv2d(256,512,3,stride=2)
+        self.make_five_conv4    = make_five_conv([512, 1024],1024)
+
+        # e.g. 3 anchors and 20 classes: 3*(5+20) = 3*(4+1+20) = 75
+        self.yolo_head1         = yolo_head([1024, len(anchors_mask[2]) * (5 + num_classes)],512)
+
+
+    def forward(self, x):
+        #  backbone: x2 is the shallowest map, x0 the deepest
+        x2, x1, x0 = self.backbone(x)
+
+        # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,2048 
+        P5 = self.conv1(x0)
+        P5 = self.SPP(P5)
+        # 13,13,2048 -> 13,13,512 -> 13,13,1024 -> 13,13,512
+        P5 = self.conv2(P5)
+
+        # 13,13,512 -> 13,13,256 -> 26,26,256
+        P5_upsample = self.upsample1(P5)
+        # 26,26,512 -> 26,26,256
+        P4 = self.conv_for_P4(x1)
+        # 26,26,256 + 26,26,256 -> 26,26,512
+        # `dim` is the documented torch.cat keyword and matches SpatialPyramidPooling
+        P4 = torch.cat([P4,P5_upsample],dim=1)
+        # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
+        P4 = self.make_five_conv1(P4)
+
+        # 26,26,256 -> 26,26,128 -> 52,52,128
+        P4_upsample = self.upsample2(P4)
+        # 52,52,256 -> 52,52,128
+        P3 = self.conv_for_P3(x2)
+        # 52,52,128 + 52,52,128 -> 52,52,256
+        P3 = torch.cat([P3,P4_upsample],dim=1)
+        # 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
+        P3 = self.make_five_conv2(P3)
+
+        # 52,52,128 -> 26,26,256
+        P3_downsample = self.down_sample1(P3)
+        # 26,26,256 + 26,26,256 -> 26,26,512
+        P4 = torch.cat([P3_downsample,P4],dim=1)
+        # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
+        P4 = self.make_five_conv3(P4)
+
+        # 26,26,256 -> 13,13,512
+        P4_downsample = self.down_sample2(P4)
+        # 13,13,512 + 13,13,512 -> 13,13,1024
+        P5 = torch.cat([P4_downsample,P5],dim=1)
+        # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
+        P5 = self.make_five_conv4(P5)
+
+        #---------------------------------------------------#
+        #   Third feature level
+        #   y3=(batch_size,75,52,52)
+        #---------------------------------------------------#
+        out2 = self.yolo_head3(P3)
+        #---------------------------------------------------#
+        #   Second feature level
+        #   y2=(batch_size,75,26,26)
+        #---------------------------------------------------#
+        out1 = self.yolo_head2(P4)
+        #---------------------------------------------------#
+        #   First feature level
+        #   y1=(batch_size,75,13,13)
+        #---------------------------------------------------#
+        out0 = self.yolo_head1(P5)
+
+        return out0, out1, out2
+
--- a/nets/yolo_training.py
+++ b/nets/yolo_training.py
--- a/predict.py
+++ b/predict.py
-#----------------------------------------------------#
-#   对视频中的predict.py进行了修改，
-#   将单张图片预测、摄像头检测和FPS测试功能
+#-----------------------------------------------------------------------#
+#   predict.py将单张图片预测、摄像头检测、FPS测试和目录遍历检测等功能
 #   整合到了一个py文件中，通过指定mode进行模式的修改。
-#----------------------------------------------------#
+#-----------------------------------------------------------------------#
 import time

 import cv2
@@ -13,33 +12,44 @@ from yolo import YOLO

 if __name__ == "__main__":
    yolo = YOLO()
-    #-------------------------------------------------------------------------#
+    #----------------------------------------------------------------------------------------------------------#
    #   mode用于指定测试的模式：
-    #   'predict'表示单张图片预测
-    #   'video'表示视频检测
-    #   'fps'表示测试fps
-    #-------------------------------------------------------------------------#
+    #   'predict'表示单张图片预测，如果想对预测过程进行修改，如保存图片，截取对象等，可以先看下方详细的注释
+    #   'video'表示视频检测，可调用摄像头或者视频进行检测，详情查看下方注释。
+    #   'fps'表示测试fps，使用的图片是img里面的street.jpg，详情查看下方注释。
+    #   'dir_predict'表示遍历文件夹进行检测并保存。默认遍历img文件夹，保存img_out文件夹，详情查看下方注释。
+    #----------------------------------------------------------------------------------------------------------#
    mode = "predict"
    #-------------------------------------------------------------------------#
    #   video_path用于指定视频的路径，当video_path=0时表示检测摄像头
    #   video_save_path表示视频保存的路径，当video_save_path=""时表示不保存
    #   video_fps用于保存的视频的fps
    #   video_path、video_save_path和video_fps仅在mode='video'时有效
-    #   保存视频时需要ctrl+c退出才会完成完整的保存步骤，不可直接结束程序。
+    #   保存视频时需要ctrl+c退出或者运行到最后一帧才会完成完整的保存步骤。
    #-------------------------------------------------------------------------#
    video_path      = 0
    video_save_path = ""
    video_fps       = 25.0
+    #-------------------------------------------------------------------------#
+    #   test_interval用于指定测量fps的时候，图片检测的次数
+    #   理论上test_interval越大，fps越准确。
+    #-------------------------------------------------------------------------#
+    test_interval   = 100
+    #-------------------------------------------------------------------------#
+    #   dir_origin_path指定了用于检测的图片的文件夹路径
+    #   dir_save_path指定了检测完图片的保存路径
+    #   dir_origin_path和dir_save_path仅在mode='dir_predict'时有效
+    #-------------------------------------------------------------------------#
+    dir_origin_path = "img/"
+    dir_save_path   = "img_out/"

    if mode == "predict":
        '''
-        1、该代码无法直接进行批量预测，如果想要批量预测，可以利用os.listdir()遍历文件夹，利用Image.open打开图片文件进行预测。
-        具体流程可以参考get_dr_txt.py，在get_dr_txt.py即实现了遍历还实现了目标信息的保存。
-        2、如果想要进行检测完的图片的保存，利用r_image.save("img.jpg")即可保存，直接在predict.py里进行修改即可。 
-        3、如果想要获得预测框的坐标，可以进入yolo.detect_image函数，在绘图部分读取top，left，bottom，right这四个值。
-        4、如果想要利用预测框截取下目标，可以进入yolo.detect_image函数，在绘图部分利用获取到的top，left，bottom，right这四个值
+        1、如果想要进行检测完的图片的保存，利用r_image.save("img.jpg")即可保存，直接在predict.py里进行修改即可。 
+        2、如果想要获得预测框的坐标，可以进入yolo.detect_image函数，在绘图部分读取top，left，bottom，right这四个值。
+        3、如果想要利用预测框截取下目标，可以进入yolo.detect_image函数，在绘图部分利用获取到的top，left，bottom，right这四个值
        在原图上利用矩阵的方式进行截取。
-        5、如果想要在预测图上写额外的字，比如检测到的特定目标的数量，可以进入yolo.detect_image函数，在绘图部分对predicted_class进行判断，
+        4、如果想要在预测图上写额外的字，比如检测到的特定目标的数量，可以进入yolo.detect_image函数，在绘图部分对predicted_class进行判断，
        比如判断if predicted_class == 'car': 即可判断当前目标是否为车，然后记录数量即可。利用draw.text即可写字。
        '''
        while True:
@@ -54,11 +64,11 @@ if __name__ == "__main__":
                r_image.show()

    elif mode == "video":
-        capture=cv2.VideoCapture(video_path)
+        capture = cv2.VideoCapture(video_path)
        if video_save_path!="":
-            fourcc = cv2.VideoWriter_fourcc(*'XVID')
-            size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
-            out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
+            fourcc  = cv2.VideoWriter_fourcc(*'XVID')
+            size    = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+            out     = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)

        fps = 0.0
        while(True):
@@ -91,9 +101,23 @@ if __name__ == "__main__":
        cv2.destroyAllWindows()

    elif mode == "fps":
-        test_interval = 100
        img = Image.open('img/street.jpg')
        tact_time = yolo.get_FPS(img, test_interval)
        print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1')
+
+    elif mode == "dir_predict":
+        import os
+        from tqdm import tqdm
+
+        img_names = os.listdir(dir_origin_path)
+        for img_name in tqdm(img_names):
+            if img_name.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
+                image_path  = os.path.join(dir_origin_path, img_name)
+                image       = Image.open(image_path)
+                r_image     = yolo.detect_image(image)
+                if not os.path.exists(dir_save_path):
+                    os.makedirs(dir_save_path)
+                r_image.save(os.path.join(dir_save_path, img_name))
+                
    else:
-        raise AssertionError("Please specify the correct mode: 'predict', 'video' or 'fps'.")
+        raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps' or 'dir_predict'.")
--- a/summary.py
+++ b/summary.py
+#--------------------------------------------#
+#   该部分代码用于看网络结构
+#--------------------------------------------#
+import torch
+from torchsummary import summary
+
+from nets.yolo import YoloBody
+
+if __name__ == "__main__":
+    # Run on the GPU when CUDA is available, otherwise on the CPU
+    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device      = torch.device(device_name)
+    # Three anchor-index groups and 80 classes, then print a layer summary for a 3x416x416 input
+    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+    m            = YoloBody(anchors_mask, 80).to(device)
+    summary(m, input_size=(3, 416, 416))
--- a/train.py
+++ b/train.py
--- a/utils/__init__.py
+++ b/utils/__init__.py
+#
\ No newline at end of file
--- a/utils/callbacks.py
+++ b/utils/callbacks.py
+import os
+
+import scipy.signal
+from matplotlib import pyplot as plt
+
+
+class LossHistory():
+    """Keep per-epoch train/val losses, append them to text files and redraw a loss-curve image.
+
+    All output goes to <log_dir>/loss_<timestamp>/, where the timestamp is taken
+    once at construction time with one-second resolution.
+    """
+    def __init__(self, log_dir):
+        """Create the output directory under log_dir and start with empty histories."""
+        import datetime
+        curr_time = datetime.datetime.now()
+        time_str = datetime.datetime.strftime(curr_time,'%Y_%m_%d_%H_%M_%S')
+        self.log_dir    = log_dir
+        self.time_str   = time_str
+        self.save_path  = os.path.join(self.log_dir, "loss_" + str(self.time_str))
+        self.losses     = []
+        self.val_loss   = []
+        
+        # exist_ok: two instances created within the same second share a
+        # directory name; that must not raise FileExistsError.
+        os.makedirs(self.save_path, exist_ok=True)
+
+    def append_loss(self, loss, val_loss):
+        """Record one epoch: store both values, append each to its text file, redraw the plot."""
+        self.losses.append(loss)
+        self.val_loss.append(val_loss)
+        with open(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".txt"), 'a') as f:
+            f.write(str(loss))
+            f.write("\n")
+        with open(os.path.join(self.save_path, "epoch_val_loss_" + str(self.time_str) + ".txt"), 'a') as f:
+            f.write(str(val_loss))
+            f.write("\n")
+        self.loss_plot()
+
+    def loss_plot(self):
+        """Draw raw (and, once enough epochs exist, smoothed) loss curves and save them as a PNG."""
+        iters = range(len(self.losses))
+
+        plt.figure()
+        plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss')
+        plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss')
+
+        # Savitzky-Golay window: 5 points for short histories, 15 otherwise
+        if len(self.losses) < 25:
+            num = 5
+        else:
+            num = 15
+
+        # savgol_filter rejects a window longer than the data, so the smoothed
+        # curves are skipped explicitly until enough epochs are recorded
+        # (previously a bare `except: pass` hid that error and any other).
+        if len(self.losses) >= num:
+            plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss')
+            plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss')
+
+        plt.grid(True)
+        plt.xlabel('Epoch')
+        plt.ylabel('Loss')
+        plt.legend(loc="upper right")
+
+        plt.savefig(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".png"))
+
+        # Release the figure so repeated calls do not accumulate open figures
+        plt.cla()
+        plt.close("all")
--- a/utils/dataloader.py
+++ b/utils/dataloader.py
+from random import sample, shuffle
+
 import cv2
 import numpy as np
 from PIL import Image
 from torch.utils.data.dataset import Dataset

-from utils.utils import merge_bboxes
+from utils.utils import cvtColor, preprocess_input


 class YoloDataset(Dataset):
-    def __init__(self, train_lines, image_size, mosaic=True, is_train=True):
+    def __init__(self, annotation_lines, input_shape, num_classes, mosaic, train):
        super(YoloDataset, self).__init__()
-
-        self.train_lines = train_lines
-        self.train_batches = len(train_lines)
-        self.image_size = image_size
-        self.mosaic = mosaic
-        self.flag = True
-        self.is_train = is_train
+        self.annotation_lines   = annotation_lines
+        self.input_shape        = input_shape
+        self.num_classes        = num_classes
+        self.length             = len(self.annotation_lines)
+        self.mosaic             = mosaic
+        self.train              = train

    def __len__(self):
-        return self.train_batches
+        return self.length
+
+    def __getitem__(self, index):
+        """Load one training sample.
+
+        Returns (image, box): image is the preprocessed float32 array moved to
+        channel-first order; box is a float32 array whose first four columns
+        are converted from pixel corners (x1, y1, x2, y2) to centre x, centre y,
+        width, height, each divided by the input width/height. Remaining
+        columns are passed through unchanged.
+        """
+        index       = index % self.length
+        #---------------------------------------------------#
+        #   With mosaic enabled, half of the samples (at random)
+        #   are built from 4 images; the rest use a single image.
+        #   Random augmentation of single images follows self.train.
+        #---------------------------------------------------#
+        use_mosaic  = self.mosaic and self.rand() < 0.5
+        # sample() needs at least 3 other candidate lines; with a smaller
+        # dataset fall back to the single-image path instead of raising ValueError.
+        if use_mosaic and self.length >= 3:
+            lines = sample(self.annotation_lines, 3)
+            lines.append(self.annotation_lines[index])
+            shuffle(lines)
+            image, box  = self.get_random_data_with_Mosaic(lines, self.input_shape)
+        else:
+            image, box  = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train)
+        image       = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
+        box         = np.array(box, dtype=np.float32)
+        if len(box) != 0:
+            # normalise x by input width (input_shape[1]) and y by input height (input_shape[0])
+            box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
+            box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
+
+            # corners -> (width, height), then top-left -> centre
+            box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
+            box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
+        return image, box

    def rand(self, a=0, b=1):
-        return np.random.rand() * (b - a) + a
+        return np.random.rand()*(b-a) + a

    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
-        """实时数据增强的随机预处理"""
-        line = annotation_line.split()
-        image = Image.open(line[0])
-        iw, ih = image.size
-        h, w = input_shape
-        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
+        line    = annotation_line.split()
+        #------------------------------#
+        #   读取图像并转换成RGB图像
+        #------------------------------#
+        image   = Image.open(line[0])
+        image   = cvtColor(image)
+        #------------------------------#
+        #   获得图像的高宽与目标高宽
+        #------------------------------#
+        iw, ih  = image.size
+        h, w    = input_shape
+        #------------------------------#
+        #   获得预测框
+        #------------------------------#
+        box     = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
@@ -38,56 +74,64 @@ class YoloDataset(Dataset):
            dx = (w-nw)//2
            dy = (h-nh)//2

-            image = image.resize((nw,nh), Image.BICUBIC)
-            new_image = Image.new('RGB', (w,h), (128,128,128))
+            #---------------------------------#
+            #   将图像多余的部分加上灰条
+            #---------------------------------#
+            image       = image.resize((nw,nh), Image.BICUBIC)
+            new_image   = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
-            image_data = np.array(new_image, np.float32)
+            image_data  = np.array(new_image, np.float32)

-            # 调整目标框坐标
-            box_data = np.zeros((len(box), 5))
-            if len(box) > 0:
+            #---------------------------------#
+            #   对真实框进行调整
+            #---------------------------------#
+            if len(box)>0:
                np.random.shuffle(box)
-                box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
-                box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
-                box[:, 0:2][box[:, 0:2] < 0] = 0
-                box[:, 2][box[:, 2] > w] = w
-                box[:, 3][box[:, 3] > h] = h
+                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+                box[:, 0:2][box[:, 0:2]<0] = 0
+                box[:, 2][box[:, 2]>w] = w
+                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
-                box = box[np.logical_and(box_w > 1, box_h > 1)]  # 保留有效框
-                box_data = np.zeros((len(box), 5))
-                box_data[:len(box)] = box
-
-            return image_data, box_data
-
-        # 调整图片大小
-        new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
+                box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
+
+            return image_data, box
+                
+        #------------------------------------------#
+        #   对图像进行缩放并且进行长和宽的扭曲
+        #------------------------------------------#
+        new_ar = w/h * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
-            nh = int(scale * h)
-            nw = int(nh * new_ar)
+            nh = int(scale*h)
+            nw = int(nh*new_ar)
        else:
-            nw = int(scale * w)
-            nh = int(nw / new_ar)
-        image = image.resize((nw, nh), Image.BICUBIC)
-
-        # 放置图片
-        dx = int(self.rand(0, w - nw))
-        dy = int(self.rand(0, h - nh))
-        new_image = Image.new('RGB', (w, h),
-                              (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)))
+            nw = int(scale*w)
+            nh = int(nw/new_ar)
+        image = image.resize((nw,nh), Image.BICUBIC)
+
+        #------------------------------------------#
+        #   将图像多余的部分加上灰条
+        #------------------------------------------#
+        dx = int(self.rand(0, w-nw))
+        dy = int(self.rand(0, h-nh))
+        new_image = Image.new('RGB', (w,h), (128,128,128))
        new_image.paste(image, (dx, dy))
        image = new_image

-        # 是否翻转图片
-        flip = self.rand() < .5
-        if flip:
-            image = image.transpose(Image.FLIP_LEFT_RIGHT)
+        #------------------------------------------#
+        #   翻转图像
+        #------------------------------------------#
+        flip = self.rand()<.5
+        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)

-        # 色域变换
+        #------------------------------------------#
+        #   色域扭曲
+        #------------------------------------------#
        hue = self.rand(-hue, hue)
-        sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat)
-        val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val)
+        sat = self.rand(1, sat) if self.rand()<.5 else 1/self.rand(1, sat)
+        val = self.rand(1, val) if self.rand()<.5 else 1/self.rand(1, val)
        x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV)
        x[..., 0] += hue*360
        x[..., 0][x[..., 0]>1] -= 1
@@ -99,112 +143,134 @@ class YoloDataset(Dataset):
        x[x<0] = 0
        image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255

-        # 调整目标框坐标
-        box_data = np.zeros((len(box), 5))
-        if len(box) > 0:
+        #---------------------------------#
+        #   对真实框进行调整
+        #---------------------------------#
+        if len(box)>0:
            np.random.shuffle(box)
-            box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
-            box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
-            if flip:
-                box[:, [0, 2]] = w - box[:, [2, 0]]
-            box[:, 0:2][box[:, 0:2] < 0] = 0
-            box[:, 2][box[:, 2] > w] = w
-            box[:, 3][box[:, 3] > h] = h
+            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+            if flip: box[:, [0,2]] = w - box[:, [2,0]]
+            box[:, 0:2][box[:, 0:2]<0] = 0
+            box[:, 2][box[:, 2]>w] = w
+            box[:, 3][box[:, 3]>h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
-            box = box[np.logical_and(box_w > 1, box_h > 1)]  # 保留有效框
-            box_data = np.zeros((len(box), 5))
-            box_data[:len(box)] = box
-
-        return image_data, box_data
-
-    def get_random_data_with_Mosaic(self, annotation_line, input_shape, hue=.1, sat=1.5, val=1.5):
+            box = box[np.logical_and(box_w>1, box_h>1)] 
+        
+        return image_data, box
+    
+    def merge_bboxes(self, bboxes, cutx, cuty):
+        """Clip the boxes of each mosaic tile to the tile's region and collect them in one list.
+
+        bboxes[i] holds the boxes of tile i as (x1, y1, x2, y2, ..., label).
+        Tile 0 keeps the area above/left of the cut point (cutx, cuty), tile 1
+        below/left, tile 2 below/right and tile 3 above/right. A box lying
+        entirely outside its tile's area is dropped; a box straddling a cut
+        line has the corresponding edge moved onto that line. Boxes of any
+        further tile index are copied unchanged. Each result is
+        [x1, y1, x2, y2, label] with label taken from the last box element.
+        """
+        merged = []
+        for tile, tile_boxes in enumerate(bboxes):
+            for box in tile_boxes:
+                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+                # Does the box straddle the horizontal / vertical cut line?
+                spans_y = y2 >= cuty and y1 <= cuty
+                spans_x = x2 >= cutx and x1 <= cutx
+
+                if tile == 0:
+                    if y1 > cuty or x1 > cutx:
+                        continue
+                    if spans_y:
+                        y2 = cuty
+                    if spans_x:
+                        x2 = cutx
+                elif tile == 1:
+                    if y2 < cuty or x1 > cutx:
+                        continue
+                    if spans_y:
+                        y1 = cuty
+                    if spans_x:
+                        x2 = cutx
+                elif tile == 2:
+                    if y2 < cuty or x2 < cutx:
+                        continue
+                    if spans_y:
+                        y1 = cuty
+                    if spans_x:
+                        x1 = cutx
+                elif tile == 3:
+                    if y1 > cuty or x2 < cutx:
+                        continue
+                    if spans_y:
+                        y2 = cuty
+                    if spans_x:
+                        x1 = cutx
+
+                merged.append([x1, y1, x2, y2, box[-1]])
+        return merged
+
+    def get_random_data_with_Mosaic(self, annotation_line, input_shape, max_boxes=100, hue=.1, sat=1.5, val=1.5):
        h, w = input_shape
-        min_offset_x = 0.3
-        min_offset_y = 0.3
-        scale_low = 1 - min(min_offset_x, min_offset_y)
-        scale_high = scale_low + 0.2
-
-        image_datas = []
-        box_datas = []
-        index = 0
-
-        place_x = [0, 0, int(w * min_offset_x), int(w * min_offset_x)]
-        place_y = [0, int(h * min_offset_y), int(h * min_offset_y), 0]
+        min_offset_x = self.rand(0.25, 0.75)
+        min_offset_y = self.rand(0.25, 0.75)
+
+        nws     = [ int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1)), int(w * self.rand(0.4, 1))]
+        nhs     = [ int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1)), int(h * self.rand(0.4, 1))]
+        
+        place_x = [int(w*min_offset_x) - nws[0], int(w*min_offset_x) - nws[1], int(w*min_offset_x), int(w*min_offset_x)]
+        place_y = [int(h*min_offset_y) - nhs[0], int(h*min_offset_y), int(h*min_offset_y), int(h*min_offset_y) - nhs[3]]
+
+        image_datas = [] 
+        box_datas   = []
+        index       = 0
        for line in annotation_line:
            # 每一行进行分割
            line_content = line.split()
            # 打开图片
            image = Image.open(line_content[0])
-            image = image.convert("RGB")
+            image = cvtColor(image)
+            
            # 图片的大小
            iw, ih = image.size
            # 保存框的位置
-            box = np.array([np.array(list(map(int, box.split(',')))) for box in line_content[1:]])
-
+            box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
+            
            # 是否翻转图片
-            flip = self.rand() < .5
-            if flip and len(box) > 0:
+            flip = self.rand()<.5
+            if flip and len(box)>0:
                image = image.transpose(Image.FLIP_LEFT_RIGHT)
-                box[:, [0, 2]] = iw - box[:, [2, 0]]
-
-            # 对输入进来的图片进行缩放
-            new_ar = w / h
-            scale = self.rand(scale_low, scale_high)
-            if new_ar < 1:
-                nh = int(scale * h)
-                nw = int(nh * new_ar)
-            else:
-                nw = int(scale * w)
-                nh = int(nw / new_ar)
-            image = image.resize((nw, nh), Image.BICUBIC)
-
-            # 进行色域变换
-            hue = self.rand(-hue, hue)
-            sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat)
-            val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val)
-            x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV)
-            x[..., 0] += hue*360
-            x[..., 0][x[..., 0]>1] -= 1
-            x[..., 0][x[..., 0]<0] += 1
-            x[..., 1] *= sat
-            x[..., 2] *= val
-            x[x[:,:, 0]>360, 0] = 360
-            x[:, :, 1:][x[:, :, 1:]>1] = 1
-            x[x<0] = 0
-            image = cv2.cvtColor(x, cv2.COLOR_HSV2RGB) # numpy array, 0 to 1
-
-            image = Image.fromarray((image * 255).astype(np.uint8))
+                box[:, [0,2]] = iw - box[:, [2,0]]
+
+            nw = nws[index] 
+            nh = nhs[index] 
+            image = image.resize((nw,nh), Image.BICUBIC)
+
            # 将图片进行放置，分别对应四张分割图片的位置
            dx = place_x[index]
            dy = place_y[index]
-            new_image = Image.new('RGB', (w, h),
-                                  (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)))
+            new_image = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image)

            index = index + 1
            box_data = []
            # 对box进行重新处理
-            if len(box) > 0:
+            if len(box)>0:
                np.random.shuffle(box)
-                box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
-                box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
-                box[:, 0:2][box[:, 0:2] < 0] = 0
-                box[:, 2][box[:, 2] > w] = w
-                box[:, 3][box[:, 3] > h] = h
+                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+                box[:, 0:2][box[:, 0:2]<0] = 0
+                box[:, 2][box[:, 2]>w] = w
+                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
-                box = box[np.logical_and(box_w > 1, box_h > 1)]
-                box_data = np.zeros((len(box), 5))
+                box = box[np.logical_and(box_w>1, box_h>1)]
+                box_data = np.zeros((len(box),5))
                box_data[:len(box)] = box
-
+            
            image_datas.append(image_data)
            box_datas.append(box_data)

        # 将图片分割，放在一起
-        cutx = np.random.randint(int(w * min_offset_x), int(w * (1 - min_offset_x)))
-        cuty = np.random.randint(int(h * min_offset_y), int(h * (1 - min_offset_y)))
+        cutx = int(w * min_offset_x)
+        cuty = int(h * min_offset_y)

        new_image = np.zeros([h, w, 3])
        new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
@@ -212,47 +278,26 @@ class YoloDataset(Dataset):
        new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
        new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]

+        # 进行色域变换
+        hue = self.rand(-hue, hue)
+        sat = self.rand(1, sat) if self.rand()<.5 else 1/self.rand(1, sat)
+        val = self.rand(1, val) if self.rand()<.5 else 1/self.rand(1, val)
+        x = cv2.cvtColor(np.array(new_image/255,np.float32), cv2.COLOR_RGB2HSV)
+        x[..., 0] += hue*360
+        x[..., 0][x[..., 0]>1] -= 1
+        x[..., 0][x[..., 0]<0] += 1
+        x[..., 1] *= sat
+        x[..., 2] *= val
+        x[x[:, :, 0]>360, 0] = 360
+        x[:, :, 1:][x[:, :, 1:]>1] = 1
+        x[x<0] = 0
+        new_image = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255
+
        # 对框进行进一步的处理
-        new_boxes = np.array(merge_bboxes(box_datas, cutx, cuty))
+        new_boxes = self.merge_bboxes(box_datas, cutx, cuty)

        return new_image, new_boxes

-    def __getitem__(self, index):
-        lines = self.train_lines
-        n = self.train_batches
-        index = index % n
-        if self.mosaic:
-            if self.flag and (index + 4) < n:
-                img, y = self.get_random_data_with_Mosaic(lines[index:index + 4], self.image_size[0:2])
-            else:
-                img, y = self.get_random_data(lines[index], self.image_size[0:2], random=self.is_train)
-            self.flag = bool(1-self.flag)
-        else:
-            img, y = self.get_random_data(lines[index], self.image_size[0:2], random=self.is_train)
-
-        if len(y) != 0:
-            # 从坐标转换成0~1的百分比
-            boxes = np.array(y[:, :4], dtype=np.float32)
-            boxes[:, 0] = boxes[:, 0] / self.image_size[1]
-            boxes[:, 1] = boxes[:, 1] / self.image_size[0]
-            boxes[:, 2] = boxes[:, 2] / self.image_size[1]
-            boxes[:, 3] = boxes[:, 3] / self.image_size[0]
-
-            boxes = np.maximum(np.minimum(boxes, 1), 0)
-            boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
-            boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
-
-            boxes[:, 0] = boxes[:, 0] + boxes[:, 2] / 2
-            boxes[:, 1] = boxes[:, 1] + boxes[:, 3] / 2
-            y = np.concatenate([boxes, y[:, -1:]], axis=-1)
-
-        img = np.array(img, dtype=np.float32)
-
-        tmp_inp = np.transpose(img / 255.0, (2, 0, 1))
-        tmp_targets = np.array(y, dtype=np.float32)
-        return tmp_inp, tmp_targets
-
-
 # DataLoader中collate_fn使用
 def yolo_dataset_collate(batch):
    images = []
@@ -261,5 +306,4 @@ def yolo_dataset_collate(batch):
        images.append(img)
        bboxes.append(box)
    images = np.array(images)
-    return images, bboxes
-
+    return images, bboxes
\ No newline at end of file
--- a/utils/utils.py
+++ b/utils/utils.py
-from __future__ import division
-
 import numpy as np
-import torch
-import torch.nn as nn
 from PIL import Image
-from torchvision.ops import nms
-
-
-class DecodeBox(nn.Module):
-    def __init__(self, anchors, num_classes, img_size):
-        super(DecodeBox, self).__init__()
-        #-----------------------------------------------------------#
-        #   13x13的特征层对应的anchor是[142, 110], [192, 243], [459, 401]
-        #   26x26的特征层对应的anchor是[36, 75], [76, 55], [72, 146]
-        #   52x52的特征层对应的anchor是[12, 16], [19, 36], [40, 28]
-        #-----------------------------------------------------------#
-        self.anchors = anchors
-        self.num_anchors = len(anchors)
-        self.num_classes = num_classes
-        self.bbox_attrs = 5 + num_classes
-        self.img_size = img_size
-
-    def forward(self, input):
-        #-----------------------------------------------#
-        #   输入的input一共有三个，他们的shape分别是
-        #   batch_size, 255, 13, 13
-        #   batch_size, 255, 26, 26
-        #   batch_size, 255, 52, 52
-        #-----------------------------------------------#
-        batch_size = input.size(0)
-        input_height = input.size(2)
-        input_width = input.size(3)
-
-        #-----------------------------------------------#
-        #   输入为416x416时
-        #   stride_h = stride_w = 32、16、8
-        #-----------------------------------------------#
-        stride_h = self.img_size[1] / input_height
-        stride_w = self.img_size[0] / input_width
-        #-------------------------------------------------#
-        #   此时获得的scaled_anchors大小是相对于特征层的
-        #-------------------------------------------------#
-        scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors]
-
-        #-----------------------------------------------#
-        #   输入的input一共有三个，他们的shape分别是
-        #   batch_size, 3, 13, 13, 85
-        #   batch_size, 3, 26, 26, 85
-        #   batch_size, 3, 52, 52, 85
-        #-----------------------------------------------#
-        prediction = input.view(batch_size, self.num_anchors,
-                                self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
-
-        # 先验框的中心位置的调整参数
-        x = torch.sigmoid(prediction[..., 0])  
-        y = torch.sigmoid(prediction[..., 1])
-        # 先验框的宽高调整参数
-        w = prediction[..., 2]
-        h = prediction[..., 3]
-        # 获得置信度，是否有物体
-        conf = torch.sigmoid(prediction[..., 4])
-        # 种类置信度
-        pred_cls = torch.sigmoid(prediction[..., 5:])
-
-        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
-        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
-
-        #----------------------------------------------------------#
-        #   生成网格，先验框中心，网格左上角 
-        #   batch_size,3,13,13
-        #----------------------------------------------------------#
-        grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
-            batch_size * self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
-        grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
-            batch_size * self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
-
-        #----------------------------------------------------------#
-        #   按照网格格式生成先验框的宽高
-        #   batch_size,3,13,13
-        #----------------------------------------------------------#
-        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
-        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
-        anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
-        anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
-
-        #----------------------------------------------------------#
-        #   利用预测结果对先验框进行调整
-        #   首先调整先验框的中心，从先验框中心向右下角偏移
-        #   再调整先验框的宽高。
-        #----------------------------------------------------------#
-        pred_boxes = FloatTensor(prediction[..., :4].shape)
-        pred_boxes[..., 0] = x.data + grid_x
-        pred_boxes[..., 1] = y.data + grid_y
-        pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
-        pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
-
-        # fig = plt.figure()
-        # ax = fig.add_subplot(121)
-        # if input_height==13:
-        #     plt.ylim(0,13)
-        #     plt.xlim(0,13)
-        # elif input_height==26:
-        #     plt.ylim(0,26)
-        #     plt.xlim(0,26)
-        # elif input_height==52:
-        #     plt.ylim(0,52)
-        #     plt.xlim(0,52)
-        # plt.scatter(grid_x.cpu(),grid_y.cpu())
-
-        # anchor_left = grid_x - anchor_w/2 
-        # anchor_top = grid_y - anchor_h/2 
-
-        # rect1 = plt.Rectangle([anchor_left[0,0,5,5],anchor_top[0,0,5,5]],anchor_w[0,0,5,5],anchor_h[0,0,5,5],color="r",fill=False)
-        # rect2 = plt.Rectangle([anchor_left[0,1,5,5],anchor_top[0,1,5,5]],anchor_w[0,1,5,5],anchor_h[0,1,5,5],color="r",fill=False)
-        # rect3 = plt.Rectangle([anchor_left[0,2,5,5],anchor_top[0,2,5,5]],anchor_w[0,2,5,5],anchor_h[0,2,5,5],color="r",fill=False)
-
-        # ax.add_patch(rect1)
-        # ax.add_patch(rect2)
-        # ax.add_patch(rect3)
-
-        # ax = fig.add_subplot(122)
-        # if input_height==13:
-        #     plt.ylim(0,13)
-        #     plt.xlim(0,13)
-        # elif input_height==26:
-        #     plt.ylim(0,26)
-        #     plt.xlim(0,26)
-        # elif input_height==52:
-        #     plt.ylim(0,52)
-        #     plt.xlim(0,52)
-        # plt.scatter(grid_x.cpu(),grid_y.cpu())
-        # plt.scatter(pred_boxes[0,:,5,5,0].cpu(),pred_boxes[0,:,5,5,1].cpu(),c='r')
-
-        # pre_left = pred_boxes[...,0] - pred_boxes[...,2]/2 
-        # pre_top = pred_boxes[...,1] - pred_boxes[...,3]/2 
-
-        # rect1 = plt.Rectangle([pre_left[0,0,5,5],pre_top[0,0,5,5]],pred_boxes[0,0,5,5,2],pred_boxes[0,0,5,5,3],color="r",fill=False)
-        # rect2 = plt.Rectangle([pre_left[0,1,5,5],pre_top[0,1,5,5]],pred_boxes[0,1,5,5,2],pred_boxes[0,1,5,5,3],color="r",fill=False)
-        # rect3 = plt.Rectangle([pre_left[0,2,5,5],pre_top[0,2,5,5]],pred_boxes[0,2,5,5,2],pred_boxes[0,2,5,5,3],color="r",fill=False)
-
-        # ax.add_patch(rect1)
-        # ax.add_patch(rect2)
-        # ax.add_patch(rect3)
-
-        # plt.show()
-
-        #----------------------------------------------------------#
-        #   将输出结果调整成相对于输入图像大小
-        #----------------------------------------------------------#
-        _scale = torch.Tensor([stride_w, stride_h] * 2).type(FloatTensor)
-        output = torch.cat((pred_boxes.view(batch_size, -1, 4) * _scale,
-                            conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
-        return output.data
-        
-def letterbox_image(image, size):
-    iw, ih = image.size
-    w, h = size
-    scale = min(w/iw, h/ih)
-    nw = int(iw*scale)
-    nh = int(ih*scale)
-
-    image = image.resize((nw,nh), Image.BICUBIC)
-    new_image = Image.new('RGB', size, (128,128,128))
-    new_image.paste(image, ((w-nw)//2, (h-nh)//2))
-    return new_image
-
-def yolo_correct_boxes(top, left, bottom, right, input_shape, image_shape):
-    new_shape = image_shape*np.min(input_shape/image_shape)
-
-    offset = (input_shape-new_shape)/2./input_shape
-    scale = input_shape/new_shape
-
-    box_yx = np.concatenate(((top+bottom)/2,(left+right)/2),axis=-1)/input_shape
-    box_hw = np.concatenate((bottom-top,right-left),axis=-1)/input_shape
-
-    box_yx = (box_yx - offset) * scale
-    box_hw *= scale

-    box_mins = box_yx - (box_hw / 2.)
-    box_maxes = box_yx + (box_hw / 2.)
-    boxes =  np.concatenate([
-        box_mins[:, 0:1],
-        box_mins[:, 1:2],
-        box_maxes[:, 0:1],
-        box_maxes[:, 1:2]
-    ],axis=-1)
-    boxes *= np.concatenate([image_shape, image_shape],axis=-1)
-    return boxes
-
-def bbox_iou(box1, box2, x1y1x2y2=True):
-    """
-        计算IOU
-    """
-    if not x1y1x2y2:
-        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
-        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
-        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
-        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
+#---------------------------------------------------------#
+#   Convert the image to RGB so that grayscale inputs do not
+#   raise errors at prediction time. Only RGB images are
+#   supported; every other kind of image is converted to RGB.
+#---------------------------------------------------------#
+def cvtColor(image):
+    """Return `image` unchanged when it is already 3-channel, else convert it to RGB.
+
+    The image is considered RGB when `np.shape(image)` has three axes and the
+    last one (the channel axis, index 2) has length 3. Anything else is
+    converted through `image.convert('RGB')`.
+    """
+    # The channel count lives on axis 2; axis -2 is the width and must not
+    # be used here (it would misclassify almost every RGB image).
+    if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
+        return image
    else:
-        b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
-        b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
-
-    inter_rect_x1 = torch.max(b1_x1, b2_x1)
-    inter_rect_y1 = torch.max(b1_y1, b2_y1)
-    inter_rect_x2 = torch.min(b1_x2, b2_x2)
-    inter_rect_y2 = torch.min(b1_y2, b2_y2)
-
-    inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * \
-                 torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0)
-                 
-    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
-    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
-
-    iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
-
-    return iou
-
-
-def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
-    #----------------------------------------------------------#
-    #   将预测结果的格式转换成左上角右下角的格式。
-    #   prediction  [batch_size, num_anchors, 85]
-    #----------------------------------------------------------#
-    box_corner = prediction.new(prediction.shape)
-    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
-    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
-    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
-    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
-    prediction[:, :, :4] = box_corner[:, :, :4]
-
-    output = [None for _ in range(len(prediction))]
-    for image_i, image_pred in enumerate(prediction):
-        #----------------------------------------------------------#
-        #   对种类预测部分取max。
-        #   class_conf  [num_anchors, 1]    种类置信度
-        #   class_pred  [num_anchors, 1]    种类
-        #----------------------------------------------------------#
-        class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
-
-        #----------------------------------------------------------#
-        #   利用置信度进行第一轮筛选
-        #----------------------------------------------------------#
-        conf_mask = (image_pred[:, 4] * class_conf[:, 0] >= conf_thres).squeeze()
-
-        #----------------------------------------------------------#
-        #   根据置信度进行预测结果的筛选
-        #----------------------------------------------------------#
-        image_pred = image_pred[conf_mask]
-        class_conf = class_conf[conf_mask]
-        class_pred = class_pred[conf_mask]
-        if not image_pred.size(0):
-            continue
-        #-------------------------------------------------------------------------#
-        #   detections  [num_anchors, 7]
-        #   7的内容为：x1, y1, x2, y2, obj_conf, class_conf, class_pred
-        #-------------------------------------------------------------------------#
-        detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
-
-        #------------------------------------------#
-        #   获得预测结果中包含的所有种类
-        #------------------------------------------#
-        unique_labels = detections[:, -1].cpu().unique()
-
-        if prediction.is_cuda:
-            unique_labels = unique_labels.cuda()
-            detections = detections.cuda()
-
-        for c in unique_labels:
-            #------------------------------------------#
-            #   获得某一类得分筛选后全部的预测结果
-            #------------------------------------------#
-            detections_class = detections[detections[:, -1] == c]
-
-            #------------------------------------------#
-            #   使用官方自带的非极大抑制会速度更快一些！
-            #------------------------------------------#
-            keep = nms(
-                detections_class[:, :4],
-                detections_class[:, 4] * detections_class[:, 5],
-                nms_thres
-            )
-            max_detections = detections_class[keep]
-            
-            # # 按照存在物体的置信度排序
-            # _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
-            # detections_class = detections_class[conf_sort_index]
-            # # 进行非极大抑制
-            # max_detections = []
-            # while detections_class.size(0):
-            #     # 取出这一类置信度最高的，一步一步往下判断，判断重合程度是否大于nms_thres，如果是则去除掉
-            #     max_detections.append(detections_class[0].unsqueeze(0))
-            #     if len(detections_class) == 1:
-            #         break
-            #     ious = bbox_iou(max_detections[-1], detections_class[1:])
-            #     detections_class = detections_class[1:][ious < nms_thres]
-            # # 堆叠
-            # max_detections = torch.cat(max_detections).data
-            
-            # Add max detections to outputs
-            output[image_i] = max_detections if output[image_i] is None else torch.cat(
-                (output[image_i], max_detections))
-
-    return output
-
-
-def merge_bboxes(bboxes, cutx, cuty):
-    merge_bbox = []
-    for i in range(len(bboxes)):
-        for box in bboxes[i]:
-            tmp_box = []
-            x1,y1,x2,y2 = box[0], box[1], box[2], box[3]
-
-            if i == 0:
-                if y1 > cuty or x1 > cutx:
-                    continue
-                if y2 >= cuty and y1 <= cuty:
-                    y2 = cuty
-                    if y2-y1 < 5:
-                        continue
-                if x2 >= cutx and x1 <= cutx:
-                    x2 = cutx
-                    if x2-x1 < 5:
-                        continue
-                
-            if i == 1:
-                if y2 < cuty or x1 > cutx:
-                    continue
-
-                if y2 >= cuty and y1 <= cuty:
-                    y1 = cuty
-                    if y2-y1 < 5:
-                        continue
-                
-                if x2 >= cutx and x1 <= cutx:
-                    x2 = cutx
-                    if x2-x1 < 5:
-                        continue
-
-            if i == 2:
-                if y2 < cuty or x2 < cutx:
-                    continue
-
-                if y2 >= cuty and y1 <= cuty:
-                    y1 = cuty
-                    if y2-y1 < 5:
-                        continue
-
-                if x2 >= cutx and x1 <= cutx:
-                    x1 = cutx
-                    if x2-x1 < 5:
-                        continue
-
-            if i == 3:
-                if y1 > cuty or x2 < cutx:
-                    continue
-
-                if y2 >= cuty and y1 <= cuty:
-                    y2 = cuty
-                    if y2-y1 < 5:
-                        continue
-
-                if x2 >= cutx and x1 <= cutx:
-                    x1 = cutx
-                    if x2-x1 < 5:
-                        continue
+        image = image.convert('RGB')
+        return image 
+
+#---------------------------------------------------#
+#   Resize the input image to the requested size
+#---------------------------------------------------#
+def resize_image(image, size, letterbox_image):
+    """Resize `image` to `size` (width, height), optionally letterboxed.
+
+    When `letterbox_image` is truthy the image is scaled by the largest
+    factor that still fits inside `size`, then pasted centred on a gray
+    (128, 128, 128) RGB canvas of exactly `size`. Otherwise the image is
+    stretched directly to `size`. Bicubic resampling is used in both cases.
+    """
+    src_w, src_h = image.size
+    dst_w, dst_h = size
+
+    # Plain stretch: no aspect-ratio preservation, no padding.
+    if not letterbox_image:
+        return image.resize((dst_w, dst_h), Image.BICUBIC)
+
+    # Letterbox: shrink/grow uniformly, then centre on a gray canvas.
+    ratio = min(dst_w/src_w, dst_h/src_h)
+    fit_w = int(src_w*ratio)
+    fit_h = int(src_h*ratio)
+
+    scaled = image.resize((fit_w, fit_h), Image.BICUBIC)
+    canvas = Image.new('RGB', size, (128,128,128))
+    canvas.paste(scaled, ((dst_w-fit_w)//2, (dst_h-fit_h)//2))
+    return canvas

-            tmp_box.append(x1)
-            tmp_box.append(y1)
-            tmp_box.append(x2)
-            tmp_box.append(y2)
-            tmp_box.append(box[-1])
-            merge_bbox.append(tmp_box)
-    return merge_bbox
+#---------------------------------------------------#
+#   Load the class names
+#---------------------------------------------------#
+def get_classes(classes_path):
+    """Read one class name per line from `classes_path` (UTF-8).
+
+    Returns a tuple `(names, count)` where `names` is the list of lines with
+    surrounding whitespace stripped and `count` is its length. Blank lines
+    are kept as empty strings.
+    """
+    with open(classes_path, encoding='utf-8') as f:
+        names = [line.strip() for line in f]
+    return names, len(names)
+
+#---------------------------------------------------#
+#   Load the anchor (prior) boxes
+#---------------------------------------------------#
+def get_anchors(anchors_path):
+    """Load anchors from the first line of a UTF-8 text file.
+
+    The line is a comma-separated list of numbers; consecutive pairs are
+    grouped into rows. Returns `(anchors, count)` where `anchors` is a numpy
+    array of shape (count, 2).
+    """
+    with open(anchors_path, encoding='utf-8') as f:
+        first_line = f.readline()
+    values = [float(token) for token in first_line.split(',')]
+    pairs = np.array(values).reshape(-1, 2)
+    return pairs, len(pairs)
+
+#---------------------------------------------------#
+#   Read the current learning rate
+#---------------------------------------------------#
+def get_lr(optimizer):
+    """Return the 'lr' entry of the optimizer's first parameter group.
+
+    Returns None when `optimizer.param_groups` is empty.
+    """
+    # Only the first group is ever inspected; the generator is lazy, so
+    # later groups are never touched.
+    return next((group['lr'] for group in optimizer.param_groups), None)
+
+def preprocess_input(image):
+    """Divide `image` by 255.0 and return it.
+
+    The division uses augmented assignment, so for mutable array types
+    (e.g. a numpy float array) the caller's object is modified in place and
+    that same object is returned.
+    NOTE(review): presumably expects a floating-point pixel array in the
+    0-255 range; an integer numpy array would not support in-place true
+    division -- confirm against callers.
+    """
+    image /= 255.0
+    return image
\ No newline at end of file
--- a/utils/utils_bbox.py
+++ b/utils/utils_bbox.py
+import torch
+import torch.nn as nn
+from torchvision.ops import nms
+import numpy as np
+
+class DecodeBox():
+    """Decode raw YOLO head outputs into boxes and filter them with NMS.
+
+    Typical flow: `decode_box` turns each feature map into normalized
+    (x, y, w, h, obj_conf, class scores...) rows, the caller concatenates the
+    per-layer results, and `non_max_suppression` filters them and maps the
+    surviving boxes back onto the original image via `yolo_correct_boxes`.
+    """
+    def __init__(self, anchors, num_classes, input_shape, anchors_mask = None):
+        """Store the decoding configuration.
+
+        anchors      -- array of (width, height) anchors; indexed with the
+                        lists in `anchors_mask`, so it must support numpy
+                        style fancy indexing.
+        num_classes  -- number of object classes.
+        input_shape  -- network input size as (height, width).
+        anchors_mask -- for each output layer, the indices into `anchors`
+                        used by that layer. Defaults to
+                        [[6,7,8], [3,4,5], [0,1,2]].
+        """
+        super(DecodeBox, self).__init__()
+        self.anchors        = anchors
+        self.num_classes    = num_classes
+        self.bbox_attrs     = 5 + num_classes
+        self.input_shape    = input_shape
+        #-----------------------------------------------------------#
+        #   Anchors of the 13x13 layer: [142, 110],[192, 243],[459, 401]
+        #   Anchors of the 26x26 layer: [36, 75],[76, 55],[72, 146]
+        #   Anchors of the 52x52 layer: [12, 16],[19, 36],[40, 28]
+        #
+        #   The default is built here, per instance, instead of in the
+        #   signature: a list default would be shared (and mutable)
+        #   across every DecodeBox created without an explicit mask.
+        #-----------------------------------------------------------#
+        if anchors_mask is None:
+            anchors_mask = [[6,7,8], [3,4,5], [0,1,2]]
+        self.anchors_mask   = anchors_mask
+
+    def decode_box(self, inputs):
+        """Decode every feature map in `inputs`.
+
+        inputs -- sequence of tensors shaped
+                  (batch, len(mask) * (5 + num_classes), height, width),
+                  one per entry of `anchors_mask`, in the same order.
+
+        Returns a list with one tensor per feature map, each shaped
+        (batch, len(mask) * height * width, 5 + num_classes). Columns are
+        x, y, w, h (all divided by the feature-map size, i.e. normalized),
+        objectness and the per-class scores (sigmoid-activated).
+        """
+        outputs = []
+        for i, feature in enumerate(inputs):
+            #-----------------------------------------------#
+            #   There are three feature maps; for a 416x416
+            #   input and 80 classes their shapes are
+            #   batch_size, 255, 13, 13
+            #   batch_size, 255, 26, 26
+            #   batch_size, 255, 52, 52
+            #-----------------------------------------------#
+            batch_size      = feature.size(0)
+            input_height    = feature.size(2)
+            input_width     = feature.size(3)
+            num_anchors     = len(self.anchors_mask[i])
+
+            #-----------------------------------------------#
+            #   For a 416x416 input
+            #   stride_h = stride_w = 32, 16, 8
+            #-----------------------------------------------#
+            stride_h = self.input_shape[0] / input_height
+            stride_w = self.input_shape[1] / input_width
+            #-------------------------------------------------#
+            #   Anchor sizes expressed in feature-map cells
+            #-------------------------------------------------#
+            scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in self.anchors[self.anchors_mask[i]]]
+
+            #-----------------------------------------------#
+            #   Move the per-box attributes to the last axis:
+            #   batch_size, 3, 13, 13, 85
+            #   batch_size, 3, 26, 26, 85
+            #   batch_size, 3, 52, 52, 85
+            #-----------------------------------------------#
+            prediction = feature.view(batch_size, num_anchors,
+                                      self.bbox_attrs, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
+
+            #-----------------------------------------------#
+            #   Centre offsets of the box inside its cell
+            #-----------------------------------------------#
+            x = torch.sigmoid(prediction[..., 0])
+            y = torch.sigmoid(prediction[..., 1])
+            #-----------------------------------------------#
+            #   Width / height adjustment (log-space)
+            #-----------------------------------------------#
+            w = prediction[..., 2]
+            h = prediction[..., 3]
+            #-----------------------------------------------#
+            #   Objectness confidence
+            #-----------------------------------------------#
+            conf        = torch.sigmoid(prediction[..., 4])
+            #-----------------------------------------------#
+            #   Per-class confidence
+            #-----------------------------------------------#
+            pred_cls    = torch.sigmoid(prediction[..., 5:])
+
+            # Pick tensor constructors that live on the same device as the input.
+            FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
+            LongTensor  = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
+
+            #----------------------------------------------------------#
+            #   Build the grid of cell top-left corners
+            #   batch_size,3,13,13
+            #----------------------------------------------------------#
+            grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
+                batch_size * num_anchors, 1, 1).view(x.shape).type(FloatTensor)
+            grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
+                batch_size * num_anchors, 1, 1).view(y.shape).type(FloatTensor)
+
+            #----------------------------------------------------------#
+            #   Broadcast the anchor widths / heights over the grid
+            #   batch_size,3,13,13
+            #----------------------------------------------------------#
+            anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
+            anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
+            anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
+            anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
+
+            #----------------------------------------------------------#
+            #   Adjust the anchors with the predictions:
+            #   the centre is shifted from the cell corner towards the
+            #   bottom-right, then width and height are rescaled.
+            #----------------------------------------------------------#
+            pred_boxes          = FloatTensor(prediction[..., :4].shape)
+            pred_boxes[..., 0]  = x.data + grid_x
+            pred_boxes[..., 1]  = y.data + grid_y
+            pred_boxes[..., 2]  = torch.exp(w.data) * anchor_w
+            pred_boxes[..., 3]  = torch.exp(h.data) * anchor_h
+
+            #----------------------------------------------------------#
+            #   Normalize the boxes to fractions of the feature map
+            #----------------------------------------------------------#
+            _scale = torch.Tensor([input_width, input_height, input_width, input_height]).type(FloatTensor)
+            output = torch.cat((pred_boxes.view(batch_size, -1, 4) / _scale,
+                                conf.view(batch_size, -1, 1), pred_cls.view(batch_size, -1, self.num_classes)), -1)
+            outputs.append(output.data)
+        return outputs
+
+    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
+        """Map normalized boxes from network-input space onto the original image.
+
+        box_xy, box_wh  -- numpy arrays of box centres and sizes, last axis
+                           ordered (x, y) / (w, h), normalized to [0, 1].
+        input_shape     -- network input size; image_shape -- original image
+                           size. Both are multiplied against (y, x) ordered
+                           values, so they are expected as (height, width).
+        letterbox_image -- truthy when the image was letterboxed, in which
+                           case the gray padding is removed from the boxes.
+
+        Returns an array whose last axis is (y_min, x_min, y_max, x_max)
+        scaled by `image_shape`. The inputs are not modified.
+        """
+        #-----------------------------------------------------------------#
+        #   The y axis is put first so the boxes can be multiplied
+        #   directly with the (height, width) image shape.
+        #-----------------------------------------------------------------#
+        box_yx = box_xy[..., ::-1]
+        box_hw = box_wh[..., ::-1]
+        input_shape = np.array(input_shape)
+        image_shape = np.array(image_shape)
+
+        if letterbox_image:
+            #-----------------------------------------------------------------#
+            #   offset is the shift of the valid image area relative to the
+            #   top-left corner of the padded input; new_shape is the size
+            #   of the image after the letterbox scaling.
+            #-----------------------------------------------------------------#
+            new_shape = np.round(image_shape * np.min(input_shape/image_shape))
+            offset  = (input_shape - new_shape)/2./input_shape
+            scale   = input_shape/new_shape
+
+            box_yx  = (box_yx - offset) * scale
+            # box_hw is a view of the caller's box_wh: build a new array
+            # rather than scaling in place, which would corrupt the argument.
+            box_hw  = box_hw * scale
+
+        box_mins    = box_yx - (box_hw / 2.)
+        box_maxes   = box_yx + (box_hw / 2.)
+        boxes  = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
+        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
+        return boxes
+
+    def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
+        """Filter decoded predictions by score, run per-class NMS and rescale boxes.
+
+        prediction -- tensor [batch_size, num_anchors, 5 + num_classes] with
+                      boxes as normalized (cx, cy, w, h). NOTE: its first
+                      four columns are overwritten in place with corner
+                      coordinates (x1, y1, x2, y2).
+
+        Returns a list with one entry per image: None when nothing passed
+        the threshold, otherwise a numpy array of rows
+        (box[0:4], obj_conf, class_conf, class_pred) where the box has been
+        mapped to the original image by `yolo_correct_boxes`.
+        """
+        #----------------------------------------------------------#
+        #   Convert boxes to top-left / bottom-right corners.
+        #   prediction  [batch_size, num_anchors, 85]
+        #----------------------------------------------------------#
+        box_corner          = prediction.new(prediction.shape)
+        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+        prediction[:, :, :4] = box_corner[:, :, :4]
+
+        output = [None for _ in range(len(prediction))]
+        for i, image_pred in enumerate(prediction):
+            #----------------------------------------------------------#
+            #   Take the max over the class scores.
+            #   class_conf  [num_anchors, 1]    class confidence
+            #   class_pred  [num_anchors, 1]    class index
+            #----------------------------------------------------------#
+            class_conf, class_pred = torch.max(image_pred[:, 5:5 + num_classes], 1, keepdim=True)
+
+            #----------------------------------------------------------#
+            #   First filter: objectness * class confidence.
+            #   The mask is already 1-D; it must NOT be squeezed, or a
+            #   single-row prediction would yield a 0-d mask and break
+            #   the boolean indexing below.
+            #----------------------------------------------------------#
+            conf_mask = image_pred[:, 4] * class_conf[:, 0] >= conf_thres
+
+            #----------------------------------------------------------#
+            #   Keep only the rows that passed the threshold
+            #----------------------------------------------------------#
+            image_pred = image_pred[conf_mask]
+            class_conf = class_conf[conf_mask]
+            class_pred = class_pred[conf_mask]
+            if not image_pred.size(0):
+                continue
+            #-------------------------------------------------------------------------#
+            #   detections  [num_anchors, 7]
+            #   columns: x1, y1, x2, y2, obj_conf, class_conf, class_pred
+            #-------------------------------------------------------------------------#
+            detections = torch.cat((image_pred[:, :5], class_conf.float(), class_pred.float()), 1)
+
+            #------------------------------------------#
+            #   All classes present in the detections
+            #------------------------------------------#
+            unique_labels = detections[:, -1].cpu().unique()
+
+            if prediction.is_cuda:
+                unique_labels = unique_labels.cuda()
+                detections = detections.cuda()
+
+            for c in unique_labels:
+                #------------------------------------------#
+                #   Detections of this class that passed
+                #   the score filter
+                #------------------------------------------#
+                detections_class = detections[detections[:, -1] == c]
+
+                #------------------------------------------#
+                #   The library NMS is faster than a
+                #   hand-written loop.
+                #------------------------------------------#
+                keep = nms(
+                    detections_class[:, :4],
+                    detections_class[:, 4] * detections_class[:, 5],
+                    nms_thres
+                )
+                max_detections = detections_class[keep]
+
+                # Add max detections to outputs
+                output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))
+
+            if output[i] is not None:
+                # Back to centre/size form, then map onto the original image.
+                output[i]           = output[i].cpu().numpy()
+                box_xy, box_wh      = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2]
+                output[i][:, :4]    = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
+        return output
--- a/utils/utils_fit.py
+++ b/utils/utils_fit.py
+import torch
+from tqdm import tqdm
+
+from utils.utils import get_lr
+        
+def _batch_to_tensors(batch, cuda):
+    """Convert one (images, targets) numpy batch into float tensors.
+
+    Args:
+        batch: Sequence whose first item is a numpy array of images and whose
+            second item is an iterable of numpy annotation arrays.
+        cuda: When truthy, every tensor is moved to the GPU with ``.cuda()``.
+
+    Returns:
+        Tuple ``(images, targets)``: a float tensor and a list of float tensors.
+    """
+    def _convert(array):
+        tensor = torch.from_numpy(array).type(torch.FloatTensor)
+        return tensor.cuda() if cuda else tensor
+
+    with torch.no_grad():
+        images  = _convert(batch[0])
+        targets = [_convert(ann) for ann in batch[1]]
+    return images, targets
+
+
+def _batch_loss(model_train, yolo_loss, images, targets):
+    """Run a forward pass and return the loss summed over all heads, divided by the summed positive count."""
+    outputs         = model_train(images)
+
+    loss_value_all  = 0
+    num_pos_all     = 0
+    # yolo_loss is called once per output head and receives the head index first
+    for l, output in enumerate(outputs):
+        loss_item, num_pos = yolo_loss(l, output, targets)
+        loss_value_all  += loss_item
+        num_pos_all     += num_pos
+    return loss_value_all / num_pos_all
+
+
+def fit_one_epoch(model_train, model, yolo_loss, loss_history, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda):
+    """Train for one epoch, validate, log both losses and save a checkpoint.
+
+    Args:
+        model_train: Module used for the forward passes; switched between
+            ``train()`` and ``eval()`` here.
+        model: Module whose ``state_dict()`` is written to the checkpoint.
+        yolo_loss: Callable ``(head_index, head_output, targets)`` returning
+            ``(loss_item, num_pos)``.
+        loss_history: Object exposing ``append_loss(train_loss, val_loss)``.
+        optimizer: Optimizer stepped once per training batch.
+        epoch: Zero-based epoch index (displayed as ``epoch + 1``).
+        epoch_step: Maximum number of training batches to consume; also the
+            divisor of the reported training loss.
+        epoch_step_val: Same as ``epoch_step`` for the validation batches.
+        gen: Iterable yielding training batches ``(images, targets)``.
+        gen_val: Iterable yielding validation batches ``(images, targets)``.
+        Epoch: Total number of epochs, used for display only.
+        cuda: When truthy, batches are moved to the GPU.
+
+    Raises:
+        ZeroDivisionError: If ``epoch_step`` or ``epoch_step_val`` is 0.
+
+    Side effects:
+        Saves ``logs/ep<epoch>-loss<train>-val_loss<val>.pth``.
+    """
+    loss        = 0
+    val_loss    = 0
+
+    model_train.train()
+    print('Start Train')
+    with tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',mininterval=0.3) as pbar:
+        for iteration, batch in enumerate(gen):
+            if iteration >= epoch_step:
+                break
+
+            images, targets = _batch_to_tensors(batch, cuda)
+            #----------------------#
+            #   Clear the gradients
+            #----------------------#
+            optimizer.zero_grad()
+            #----------------------#
+            #   Forward pass + loss
+            #----------------------#
+            loss_value = _batch_loss(model_train, yolo_loss, images, targets)
+
+            #----------------------#
+            #   Backward pass
+            #----------------------#
+            loss_value.backward()
+            optimizer.step()
+
+            loss += loss_value.item()
+
+            # Show the running mean of the loss over the batches seen so far
+            pbar.set_postfix(loss=loss / (iteration + 1), lr=get_lr(optimizer))
+            pbar.update(1)
+
+    print('Finish Train')
+
+    model_train.eval()
+    print('Start Validation')
+    with tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',mininterval=0.3) as pbar:
+        for iteration, batch in enumerate(gen_val):
+            if iteration >= epoch_step_val:
+                break
+
+            images, targets = _batch_to_tensors(batch, cuda)
+            # Validation never back-propagates, so no graph is built and the optimizer is left untouched
+            with torch.no_grad():
+                loss_value = _batch_loss(model_train, yolo_loss, images, targets)
+
+            val_loss += loss_value.item()
+            pbar.set_postfix(val_loss=val_loss / (iteration + 1))
+            pbar.update(1)
+
+    print('Finish Validation')
+
+    train_loss_avg  = loss / epoch_step
+    val_loss_avg    = val_loss / epoch_step_val
+
+    loss_history.append_loss(train_loss_avg, val_loss_avg)
+    print('Epoch:'+ str(epoch+1) + '/' + str(Epoch))
+    print('Total Loss: %.3f || Val Loss: %.3f ' % (train_loss_avg, val_loss_avg))
+    torch.save(model.state_dict(), 'logs/ep%03d-loss%.3f-val_loss%.3f.pth' % (epoch + 1, train_loss_avg, val_loss_avg))
--- a/utils/utils_map.py
+++ b/utils/utils_map.py
--- a/voc_annotation.py
+++ b/voc_annotation.py
-#---------------------------------------------#
-#   运行前一定要修改classes
+import os
+import random
+import xml.etree.ElementTree as ET
+
+from utils.utils import get_classes
+
+#--------------------------------------------------------------------------------------------------------------------------------#
+#   annotation_mode selects what this script computes when it is run
+#   annotation_mode = 0: the whole label-processing pipeline, i.e. the txt files in VOCdevkit/VOC2007/ImageSets plus the 2007_train.txt and 2007_val.txt used for training
+#   annotation_mode = 1: only the txt files in VOCdevkit/VOC2007/ImageSets
+#   annotation_mode = 2: only the 2007_train.txt and 2007_val.txt used for training
+#--------------------------------------------------------------------------------------------------------------------------------#
+annotation_mode     = 0
+#-------------------------------------------------------------------#
+#   Must be edited: supplies the object classes written into 2007_train.txt and 2007_val.txt
+#   Use the same file as the classes_path used for training and prediction
 #   如果生成的2007_train.txt里面没有目标信息
 #   那么就是因为classes没有设定正确
-#---------------------------------------------#
-import xml.etree.ElementTree as ET
-from os import getcwd
+#   (If the generated 2007_train.txt contains no object information, the classes are not set correctly)
+#   Only takes effect when annotation_mode is 0 or 2
+#-------------------------------------------------------------------#
+classes_path        = 'model_data/voc_classes.txt'
+#--------------------------------------------------------------------------------------------------------------------------------#
+#   trainval_percent sets the ratio of (training set + validation set) to test set; by default (training + validation):test = 9:1
+#   train_percent sets the ratio of training set to validation set inside (training + validation); by default training:validation = 9:1
+#   Only takes effect when annotation_mode is 0 or 1
+#--------------------------------------------------------------------------------------------------------------------------------#
+trainval_percent    = 0.9
+train_percent       = 0.9
+#-------------------------------------------------------#
+#   Folder that contains the VOC dataset
+#   The default is the relative path 'VOCdevkit'
+#-------------------------------------------------------#
+VOCdevkit_path  = 'VOCdevkit'

-sets=[('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
-#-----------------------------------------------------#
-#   这里设定的classes顺序要和model_data里的txt一样
-#-----------------------------------------------------#
-classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+VOCdevkit_sets  = [('2007', 'train'), ('2007', 'val')]
+classes, _      = get_classes(classes_path)

 def convert_annotation(year, image_id, list_file):
-    in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id), encoding='utf-8')
+    in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml'%(year, image_id)), encoding='utf-8')
    tree=ET.parse(in_file)
    root = tree.getroot()

@@ -28,14 +51,59 @@ def convert_annotation(year, image_id, list_file):
        xmlbox = obj.find('bndbox')
        b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
        list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
+        
+if __name__ == "__main__":
+    # Fixed seed so the same directory listing always yields the same split
+    random.seed(0)
+    if annotation_mode == 0 or annotation_mode == 1:
+        print("Generate txt in ImageSets.")
+        xmlfilepath     = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
+        saveBasePath    = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
+        # Only .xml annotation files define samples; anything else in the folder is ignored
+        total_xml       = [entry for entry in os.listdir(xmlfilepath) if entry.endswith(".xml")]
+
+        num         = len(total_xml)
+        indices     = range(num)
+        tv          = int(num*trainval_percent)
+        tr          = int(tv*train_percent)
+        # Draw from sequences first (the draw depends on their order), then
+        # build sets so the membership tests below are O(1) instead of O(n)
+        trainval    = random.sample(indices, tv)
+        train       = random.sample(trainval, tr)
+        trainval_ids    = set(trainval)
+        train_ids       = set(train)
+
+        print("train and val size",tv)
+        print("train size",tr)
+        # Written as UTF-8 because the second stage reads these files back with encoding='utf-8'
+        with open(os.path.join(saveBasePath,'trainval.txt'), 'w', encoding='utf-8') as ftrainval, \
+             open(os.path.join(saveBasePath,'test.txt'), 'w', encoding='utf-8') as ftest, \
+             open(os.path.join(saveBasePath,'train.txt'), 'w', encoding='utf-8') as ftrain, \
+             open(os.path.join(saveBasePath,'val.txt'), 'w', encoding='utf-8') as fval:
+            for i in indices:
+                # Strip the ".xml" suffix to get the image id
+                name = total_xml[i][:-4]+'\n'
+                if i in trainval_ids:
+                    ftrainval.write(name)
+                    if i in train_ids:
+                        ftrain.write(name)
+                    else:
+                        fval.write(name)
+                else:
+                    ftest.write(name)
+        print("Generate txt in ImageSets done.")

-wd = getcwd()
+    if annotation_mode == 0 or annotation_mode == 2:
+        print("Generate 2007_train.txt and 2007_val.txt for train.")
+        # The absolute dataset root is the same for every written image path
+        voc_root = os.path.abspath(VOCdevkit_path)
+        for year, image_set in VOCdevkit_sets:
+            ids_path = os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt'%(year, image_set))
+            with open(ids_path, encoding='utf-8') as id_file:
+                image_ids = id_file.read().strip().split()

-for year, image_set in sets:
-    image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set), encoding='utf-8').read().strip().split()
-    list_file = open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8')
-    for image_id in image_ids:
-        list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg'%(wd, year, image_id))
-        convert_annotation(year, image_id, list_file)
-        list_file.write('\n')
-    list_file.close()
+            # One output line per image: absolute image path, then the boxes appended by convert_annotation
+            with open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8') as list_file:
+                for image_id in image_ids:
+                    list_file.write('%s/VOC%s/JPEGImages/%s.jpg'%(voc_root, year, image_id))
+                    convert_annotation(year, image_id, list_file)
+                    list_file.write('\n')
+        print("Generate 2007_train.txt and 2007_val.txt for train done.")
--- a/yolo.py
+++ b/yolo.py