From 5b6f4c01c81528c2ccfe4bd2aceca93c84220d6c Mon Sep 17 00:00:00 2001
From: Bubbliiiing <47347516+bubbliiiing@users.noreply.github.com>
Date: Thu, 14 Jan 2021 15:36:58 +0800
Subject: [PATCH] Add files via upload

---
 VOCdevkit/VOC2007/voc2yolo4.py |  11 +-
 get_dr_txt.py                  |  53 +++++---
 kmeans_for_anchors.py          |  11 +-
 nets/CSPdarknet53.py           |  43 +++++--
 nets/ious.py                   |  33 +++--
 nets/loss.py                   | 182 ++++++++++++++++++---------
 nets/yolo4.py                  | 168 ++++++++++++++++++-------
 predict.py                     |  12 +-
 test.py                        |  15 ++-
 train.py                       | 217 ++++++++++++++++++++++-----------
 utils/utils.py                 |  50 ++++++--
 video.py                       |  28 +++--
 vision_for_anchors.py          |   6 +-
 voc_annotation.py              |   5 +
 yolo.py                        |  63 ++++++----
 15 files changed, 632 insertions(+), 265 deletions(-)

diff --git a/VOCdevkit/VOC2007/voc2yolo4.py b/VOCdevkit/VOC2007/voc2yolo4.py
index 22e40c7..e3c8214 100644
--- a/VOCdevkit/VOC2007/voc2yolo4.py
+++ b/VOCdevkit/VOC2007/voc2yolo4.py
@@ -1,9 +1,18 @@
+#----------------------------------------------------------------------#
+#   验证集的划分在train.py代码里面进行
+#   test.txt和val.txt里面没有内容是正常的。训练不会使用到。
+#----------------------------------------------------------------------#
 import os
 import random 
- 
+random.seed(0)
+
 xmlfilepath=r'./VOCdevkit/VOC2007/Annotations'
 saveBasePath=r"./VOCdevkit/VOC2007/ImageSets/Main/"
  
+#----------------------------------------------------------------------#
+#   想要增加测试集修改trainval_percent
+#   train_percent不需要修改
+#----------------------------------------------------------------------#
 trainval_percent=1
 train_percent=1
 
diff --git a/get_dr_txt.py b/get_dr_txt.py
index 7253a74..d7e666a 100644
--- a/get_dr_txt.py
+++ b/get_dr_txt.py
@@ -3,18 +3,21 @@
 #   具体视频教程可查看
 #   https://www.bilibili.com/video/BV1zE411u7Vw
 #----------------------------------------------------#
-from yolo import YOLO
-from PIL import Image
-from keras.layers import Input
-from keras.applications.imagenet_utils import preprocess_input
-from keras import backend as K
-from utils.utils import letterbox_image
-from nets.yolo4 import yolo_body,yolo_eval
-from tqdm import tqdm
 import colorsys
-import numpy as np
 import os
 
+import numpy as np
+from keras import backend as K
+from keras.applications.imagenet_utils import preprocess_input
+from keras.layers import Input
+from PIL import Image
+from tqdm import tqdm
+
+from nets.yolo4 import yolo_body, yolo_eval
+from utils.utils import letterbox_image
+from yolo import YOLO
+
+
 class mAP_YOLO(YOLO):
     #---------------------------------------------------#
     #   获得所有的分类
@@ -25,12 +28,16 @@ class mAP_YOLO(YOLO):
         model_path = os.path.expanduser(self.model_path)
         assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.'
         
-        # 计算anchor数量
+        #---------------------------------------------------#
+        #   计算先验框的数量和种类的数量
+        #---------------------------------------------------#
         num_anchors = len(self.anchors)
         num_classes = len(self.class_names)
 
-        # 载入模型，如果原来的模型里已经包括了模型结构则直接载入。
-        # 否则先构建模型再载入
+        #---------------------------------------------------------#
+        #   载入模型，如果原来的模型里已经包括了模型结构则直接载入。
+        #   否则先构建模型再载入
+        #---------------------------------------------------------#
         try:
             self.yolo_model = load_model(model_path, compile=False)
         except:
@@ -58,6 +65,10 @@ class mAP_YOLO(YOLO):
 
         self.input_image_shape = K.placeholder(shape=(2, ))
 
+        #---------------------------------------------------------#
+        #   在yolo_eval函数中，我们会对预测结果进行后处理
+        #   后处理的内容包括，解码、非极大抑制、门限筛选等
+        #---------------------------------------------------------#
         boxes, scores, classes = yolo_eval(self.yolo_model.output, self.anchors,
                 num_classes, self.input_image_shape, max_boxes = self.max_boxes,
                 score_threshold = self.score, iou_threshold = self.iou)
@@ -68,21 +79,27 @@ class mAP_YOLO(YOLO):
     #---------------------------------------------------#
     def detect_image(self, image_id, image):
         f = open("./input/detection-results/"+image_id+".txt","w") 
-        # 调整图片使其符合输入要求
+        #---------------------------------------------------------#
+        #   给图像增加灰条，实现不失真的resize
+        #---------------------------------------------------------#
         new_image_size = (self.model_image_size[1],self.model_image_size[0])
         boxed_image = letterbox_image(image, new_image_size)
         image_data = np.array(boxed_image, dtype='float32')
         image_data /= 255.
-        image_data = np.expand_dims(image_data, 0)  # Add batch dimension.
-
-        # 预测结果
+        #---------------------------------------------------------#
+        #   添加上batch_size维度
+        #---------------------------------------------------------#
+        image_data = np.expand_dims(image_data, 0)
+
+        #---------------------------------------------------------#
+        #   将图像输入网络当中进行预测！
+        #---------------------------------------------------------#
         out_boxes, out_scores, out_classes = self.sess.run(
             [self.boxes, self.scores, self.classes],
             feed_dict={
                 self.yolo_model.input: image_data,
                 self.input_image_shape: [image.size[1], image.size[0]],
-                K.learning_phase(): 0
-            })
+                K.learning_phase(): 0})
 
         for i, c in enumerate(out_classes):
             predicted_class = self.class_names[int(c)]
diff --git a/kmeans_for_anchors.py b/kmeans_for_anchors.py
index 98c3650..2dcbbc0 100644
--- a/kmeans_for_anchors.py
+++ b/kmeans_for_anchors.py
@@ -1,7 +1,9 @@
-import numpy as np
-import xml.etree.ElementTree as ET
 import glob
 import random
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
 
 def cas_iou(box,cluster):
     x = np.minimum(cluster[:,0],box[0])
@@ -61,6 +63,9 @@ def load_data(path):
         tree = ET.parse(xml_file)
         height = int(tree.findtext('./size/height'))
         width = int(tree.findtext('./size/width'))
+        if height<=0 or width<=0:
+            continue
+        
         # 对于每一个目标都获得它的宽高
         for obj in tree.iter('object'):
             xmin = int(float(obj.findtext('bndbox/xmin'))) / width
@@ -103,4 +108,4 @@ if __name__ == '__main__':
         else:
             x_y = ", %d,%d" % (data[i][0], data[i][1])
         f.write(x_y)
-    f.close()
\ No newline at end of file
+    f.close()
diff --git a/nets/CSPdarknet53.py b/nets/CSPdarknet53.py
index 5bb7c5f..a838946 100644
--- a/nets/CSPdarknet53.py
+++ b/nets/CSPdarknet53.py
@@ -1,6 +1,8 @@
 from functools import wraps
+
 from keras import backend as K
-from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, MaxPooling2D, Layer
+from keras.layers import (Add, Concatenate, Conv2D, Layer, MaxPooling2D,
+                          UpSampling2D, ZeroPadding2D)
 from keras.layers.advanced_activations import LeakyReLU
 from keras.layers.normalization import BatchNormalization
 from keras.regularizers import l2
@@ -21,8 +23,11 @@ class Mish(Layer):
 
     def compute_output_shape(self, input_shape):
         return input_shape
+
 #--------------------------------------------------#
-#   单次卷积
+#   单次卷积DarknetConv2D
+#   正则化系数为5e-4
+#   如果步长为2则自己设定padding方式。
 #--------------------------------------------------#
 @wraps(Conv2D)
 def DarknetConv2D(*args, **kwargs):
@@ -32,7 +37,7 @@ def DarknetConv2D(*args, **kwargs):
     return Conv2D(*args, **darknet_conv_kwargs)
 
 #---------------------------------------------------#
-#   卷积块
+#   卷积块 -> 卷积 + 标准化 + 激活函数
 #   DarknetConv2D + BatchNormalization + Mish
 #---------------------------------------------------#
 def DarknetConv2D_BN_Mish(*args, **kwargs):
@@ -43,36 +48,48 @@ def DarknetConv2D_BN_Mish(*args, **kwargs):
         BatchNormalization(),
         Mish())
 
-#---------------------------------------------------#
+#--------------------------------------------------------------------#
 #   CSPdarknet的结构块
-#   存在一个大残差边
-#   这个大残差边绕过了很多的残差结构
-#---------------------------------------------------#
+#   首先利用ZeroPadding2D和一个步长为2x2的卷积块进行高和宽的压缩
+#   然后建立一个大的残差边shortconv、这个大残差边绕过了很多的残差结构
+#   主干部分会对num_blocks进行循环，循环内部是残差结构。
+#   对于整个CSPdarknet的结构块，就是一个大残差块+内部多个小残差块
+#--------------------------------------------------------------------#
 def resblock_body(x, num_filters, num_blocks, all_narrow=True):
-    # 进行长和宽的压缩
+    #----------------------------------------------------------------#
+    #   利用ZeroPadding2D和一个步长为2x2的卷积块进行高和宽的压缩
+    #----------------------------------------------------------------#
     preconv1 = ZeroPadding2D(((1,0),(1,0)))(x)
     preconv1 = DarknetConv2D_BN_Mish(num_filters, (3,3), strides=(2,2))(preconv1)
 
-    # 生成一个大的残差边 
+    #--------------------------------------------------------------------#
+    #   然后建立一个大的残差边shortconv、这个大残差边绕过了很多的残差结构
+    #--------------------------------------------------------------------#
     shortconv = DarknetConv2D_BN_Mish(num_filters//2 if all_narrow else num_filters, (1,1))(preconv1)
 
-    # 主干部分的卷积
+    #----------------------------------------------------------------#
+    #   主干部分会对num_blocks进行循环，循环内部是残差结构。
+    #----------------------------------------------------------------#
     mainconv = DarknetConv2D_BN_Mish(num_filters//2 if all_narrow else num_filters, (1,1))(preconv1)
-    # 1x1卷积对通道数进行整合->3x3卷积提取特征，使用残差结构
     for i in range(num_blocks):
         y = compose(
                 DarknetConv2D_BN_Mish(num_filters//2, (1,1)),
                 DarknetConv2D_BN_Mish(num_filters//2 if all_narrow else num_filters, (3,3)))(mainconv)
         mainconv = Add()([mainconv,y])
-    # 1x1卷积后和残差边堆叠
     postconv = DarknetConv2D_BN_Mish(num_filters//2 if all_narrow else num_filters, (1,1))(mainconv)
+
+    #----------------------------------------------------------------#
+    #   将大残差边再堆叠回来
+    #----------------------------------------------------------------#
     route = Concatenate()([postconv, shortconv])
 
     # 最后对通道数进行整合
     return DarknetConv2D_BN_Mish(num_filters, (1,1))(route)
 
 #---------------------------------------------------#
-#   darknet53 的主体部分
+#   CSPdarknet53 的主体部分
+#   输入为一张416x416x3的图片
+#   输出为三个有效特征层
 #---------------------------------------------------#
 def darknet_body(x):
     x = DarknetConv2D_BN_Mish(32, (3,3))(x)
diff --git a/nets/ious.py b/nets/ious.py
index 1f7fc39..a0c7a3f 100644
--- a/nets/ious.py
+++ b/nets/ious.py
@@ -12,20 +12,31 @@ def box_ciou(b1, b2):
     -------
     ciou: tensor, shape=(batch, feat_w, feat_h, anchor_num, 1)
     """
-    # 求出预测框左上角右下角
+    #-----------------------------------------------------------#
+    #   求出预测框左上角右下角
+    #   b1_mins     (batch, feat_w, feat_h, anchor_num, 2)
+    #   b1_maxes    (batch, feat_w, feat_h, anchor_num, 2)
+    #-----------------------------------------------------------#
     b1_xy = b1[..., :2]
     b1_wh = b1[..., 2:4]
     b1_wh_half = b1_wh/2.
     b1_mins = b1_xy - b1_wh_half
     b1_maxes = b1_xy + b1_wh_half
-    # 求出真实框左上角右下角
+    #-----------------------------------------------------------#
+    #   求出真实框左上角右下角
+    #   b2_mins     (batch, feat_w, feat_h, anchor_num, 2)
+    #   b2_maxes    (batch, feat_w, feat_h, anchor_num, 2)
+    #-----------------------------------------------------------#
     b2_xy = b2[..., :2]
     b2_wh = b2[..., 2:4]
     b2_wh_half = b2_wh/2.
     b2_mins = b2_xy - b2_wh_half
     b2_maxes = b2_xy + b2_wh_half
 
-    # 求真实框和预测框所有的iou
+    #-----------------------------------------------------------#
+    #   求真实框和预测框所有的iou
+    #   iou         (batch, feat_w, feat_h, anchor_num)
+    #-----------------------------------------------------------#
     intersect_mins = K.maximum(b1_mins, b2_mins)
     intersect_maxes = K.minimum(b1_maxes, b2_maxes)
     intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
@@ -33,21 +44,27 @@ def box_ciou(b1, b2):
     b1_area = b1_wh[..., 0] * b1_wh[..., 1]
     b2_area = b2_wh[..., 0] * b2_wh[..., 1]
     union_area = b1_area + b2_area - intersect_area
-    iou = intersect_area / K.maximum(union_area,K.epsilon())
+    iou = intersect_area / K.maximum(union_area, K.epsilon())
 
-    # 计算中心的差距
+    #-----------------------------------------------------------#
+    #   计算中心的差距
+    #   center_distance (batch, feat_w, feat_h, anchor_num)
+    #-----------------------------------------------------------#
     center_distance = K.sum(K.square(b1_xy - b2_xy), axis=-1)
-    # 找到包裹两个框的最小框的左上角和右下角
     enclose_mins = K.minimum(b1_mins, b2_mins)
     enclose_maxes = K.maximum(b1_maxes, b2_maxes)
     enclose_wh = K.maximum(enclose_maxes - enclose_mins, 0.0)
-    # 计算对角线距离
+    #-----------------------------------------------------------#
+    #   计算对角线距离
+    #   enclose_diagonal (batch, feat_w, feat_h, anchor_num)
+    #-----------------------------------------------------------#
     enclose_diagonal = K.sum(K.square(enclose_wh), axis=-1)
     ciou = iou - 1.0 * (center_distance) / K.maximum(enclose_diagonal ,K.epsilon())
     
-    v = 4*K.square(tf.math.atan2(b1_wh[..., 0], K.maximum(b1_wh[..., 1],K.epsilon())) - tf.math.atan2(b2_wh[..., 0], K.maximum(b2_wh[..., 1],K.epsilon()))) / (math.pi * math.pi)
+    v = 4 * K.square(tf.math.atan2(b1_wh[..., 0], K.maximum(b1_wh[..., 1], K.epsilon())) - tf.math.atan2(b2_wh[..., 0], K.maximum(b2_wh[..., 1],K.epsilon()))) / (math.pi * math.pi)
     alpha = v /  K.maximum((1.0 - iou + v), K.epsilon())
     ciou = ciou - alpha * v
 
     ciou = K.expand_dims(ciou, -1)
+    ciou = tf.where(tf.is_nan(ciou), tf.zeros_like(ciou), ciou)
     return ciou
diff --git a/nets/loss.py b/nets/loss.py
index 8de636c..4c839b1 100644
--- a/nets/loss.py
+++ b/nets/loss.py
@@ -1,6 +1,7 @@
 import numpy as np
 import tensorflow as tf
 from keras import backend as K
+
 from nets.ious import box_ciou
 
 #---------------------------------------------------#
@@ -10,17 +11,22 @@ def _smooth_labels(y_true, label_smoothing):
     num_classes = tf.cast(K.shape(y_true)[-1], dtype=K.floatx())
     label_smoothing = K.constant(label_smoothing, dtype=K.floatx())
     return y_true * (1.0 - label_smoothing) + label_smoothing / num_classes
+    
 #---------------------------------------------------#
 #   将预测值的每个特征层调成真实值
 #---------------------------------------------------#
 def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
     num_anchors = len(anchors)
-    # [1, 1, 1, num_anchors, 2]
+    #---------------------------------------------------#
+    #   [1, 1, 1, num_anchors, 2]
+    #---------------------------------------------------#
     anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])
 
-    # 获得x，y的网格
-    # (13, 13, 1, 2)
-    grid_shape = K.shape(feats)[1:3] # height, width
+    #---------------------------------------------------#
+    #   获得x，y的网格
+    #   (13, 13, 1, 2)
+    #---------------------------------------------------#
+    grid_shape = K.shape(feats)[1:3]
     grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
         [1, grid_shape[1], 1, 1])
     grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
@@ -28,22 +34,34 @@ def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
     grid = K.concatenate([grid_x, grid_y])
     grid = K.cast(grid, K.dtype(feats))
 
-    # (batch_size,13,13,3,85)
+    #---------------------------------------------------#
+    #   将预测结果调整成(batch_size,13,13,3,85)
+    #   85可拆分成4 + 1 + 80
+    #   4代表的是中心宽高的调整参数
+    #   1代表的是框的置信度
+    #   80代表的是种类的置信度
+    #---------------------------------------------------#
     feats = K.reshape(feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])
 
-    # 将预测值调成真实值
-    # box_xy对应框的中心点
-    # box_wh对应框的宽和高
+    #---------------------------------------------------#
+    #   将预测值调成真实值
+    #   box_xy对应框的中心点
+    #   box_wh对应框的宽和高
+    #---------------------------------------------------#
     box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
     box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))
     box_confidence = K.sigmoid(feats[..., 4:5])
     box_class_probs = K.sigmoid(feats[..., 5:])
 
-    # 在计算loss的时候返回如下参数
+    #---------------------------------------------------------------------#
+    #   在计算loss的时候返回grid, feats, box_xy, box_wh
+    #   在预测的时候返回box_xy, box_wh, box_confidence, box_class_probs
+    #---------------------------------------------------------------------#
     if calc_loss == True:
         return grid, feats, box_xy, box_wh
     return box_xy, box_wh, box_confidence, box_class_probs
 
+
 #---------------------------------------------------#
 #   用于计算每个预测框与真实框的iou
 #---------------------------------------------------#
@@ -77,108 +95,162 @@ def box_iou(b1, b2):
 
     return iou
 
-
 #---------------------------------------------------#
 #   loss值计算
 #---------------------------------------------------#
-def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False):
-
+def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0.1, print_loss=False, normalize=True):
     # 一共有三层
     num_layers = len(anchors)//3 
 
-    # 将预测结果和实际ground truth分开，args是[*model_body.output, *y_true]
-    # y_true是一个列表，包含三个特征层，shape分别为(m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85)。
-    # yolo_outputs是一个列表，包含三个特征层，shape分别为(m,13,13,255),(m,26,26,255),(m,52,52,255)。
+    #---------------------------------------------------------------------------------------------------#
+    #   Split the predictions from the ground truth; args is [*model_body.output, *y_true].
+    #   y_true is a list of three feature layers shaped (m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85).
+    #   yolo_outputs is a list of three raw feature layers shaped (m,13,13,255),(m,26,26,255),(m,52,52,255); yolo_head reshapes them to (...,3,85).
+    #---------------------------------------------------------------------------------------------------#
     y_true = args[num_layers:]
     yolo_outputs = args[:num_layers]
 
-    # 先验框
-    # 678为142,110,  192,243,  459,401
-    # 345为36,75,  76,55,  72,146
-    # 012为12,16,  19,36,  40,28  
+    #-----------------------------------------------------------#
+    #   13x13的特征层对应的anchor是[142, 110], [192, 243], [459, 401]
+    #   26x26的特征层对应的anchor是[36, 75], [76, 55], [72, 146]
+    #   52x52的特征层对应的anchor是[12, 16], [19, 36], [40, 28]
+    #-----------------------------------------------------------#
     anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]]
 
-    # 得到input_shpae为608,608 
+    # Recover input_shape (e.g. 416,416) from the coarsest feature map, whose stride is 32
     input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
 
     loss = 0
-
-    # 取出每一张图片
-    # m的值就是batch_size
+    num_pos = 0
+    #-----------------------------------------------------------#
+    #   取出每一张图片
+    #   m的值就是batch_size
+    #-----------------------------------------------------------#
     m = K.shape(yolo_outputs[0])[0]
     mf = K.cast(m, K.dtype(yolo_outputs[0]))
 
-    # y_true是一个列表，包含三个特征层，shape分别为(m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85)。
-    # yolo_outputs是一个列表，包含三个特征层，shape分别为(m,13,13,255),(m,26,26,255),(m,52,52,255)。
+    #---------------------------------------------------------------------------------------------------#
+    #   y_true is a list of three feature layers shaped (m,13,13,3,85),(m,26,26,3,85),(m,52,52,3,85).
+    #   yolo_outputs is a list of three raw feature layers shaped (m,13,13,255),(m,26,26,255),(m,52,52,255), where 255 = 3 x 85.
+    #---------------------------------------------------------------------------------------------------#
     for l in range(num_layers):
-        # 以第一个特征层(m,13,13,3,85)为例子
-        # 取出该特征层中存在目标的点的位置。(m,13,13,3,1)
+        #-----------------------------------------------------------#
+        #   以第一个特征层(m,13,13,3,85)为例子
+        #   取出该特征层中存在目标的点的位置。(m,13,13,3,1)
+        #-----------------------------------------------------------#
         object_mask = y_true[l][..., 4:5]
-        # 取出其对应的种类(m,13,13,3,80)
+        #-----------------------------------------------------------#
+        #   取出其对应的种类(m,13,13,3,80)
+        #-----------------------------------------------------------#
         true_class_probs = y_true[l][..., 5:]
         if label_smoothing:
             true_class_probs = _smooth_labels(true_class_probs, label_smoothing)
 
-        # 将yolo_outputs的特征层输出进行处理
-        # grid为网格结构(13,13,1,2)，raw_pred为尚未处理的预测结果(m,13,13,3,85)
-        # 还有解码后的xy，wh，(m,13,13,3,2)
+        #-----------------------------------------------------------#
+        #   将yolo_outputs的特征层输出进行处理、获得四个返回值
+        #   其中：
+        #   grid        (13,13,1,2) 网格坐标
+        #   raw_pred    (m,13,13,3,85) 尚未处理的预测结果
+        #   pred_xy     (m,13,13,3,2) 解码后的中心坐标
+        #   pred_wh     (m,13,13,3,2) 解码后的宽高坐标
+        #-----------------------------------------------------------#
         grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
              anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True)
         
-        # 这个是解码后的预测的box的位置
-        # (m,13,13,3,4)
+        #-----------------------------------------------------------#
+        #   pred_box是解码后的预测的box的位置
+        #   (m,13,13,3,4)
+        #-----------------------------------------------------------#
         pred_box = K.concatenate([pred_xy, pred_wh])
 
-        # 找到负样本群组，第一步是创建一个数组，[]
+        #-----------------------------------------------------------#
+        #   找到负样本群组，第一步是创建一个数组，[]
+        #-----------------------------------------------------------#
         ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
         object_mask_bool = K.cast(object_mask, 'bool')
         
-        # 对每一张图片计算ignore_mask
+        #-----------------------------------------------------------#
+        #   对每一张图片计算ignore_mask
+        #-----------------------------------------------------------#
         def loop_body(b, ignore_mask):
-            # 取出第b副图内，真实存在的所有的box的参数
-            # n,4
+            #-----------------------------------------------------------#
+            #   取出n个真实框：n,4
+            #-----------------------------------------------------------#
             true_box = tf.boolean_mask(y_true[l][b,...,0:4], object_mask_bool[b,...,0])
-            # 计算预测结果与真实情况的iou
-            # pred_box为13,13,3,4
-            # 计算的结果是每个pred_box和其它所有真实框的iou
-            # 13,13,3,n
+            #-----------------------------------------------------------#
+            #   计算预测框与真实框的iou
+            #   pred_box    13,13,3,4 预测框的坐标
+            #   true_box    n,4 真实框的坐标
+            #   iou         13,13,3,n 预测框和真实框的iou
+            #-----------------------------------------------------------#
             iou = box_iou(pred_box[b], true_box)
 
-            # 13,13,3
+            #-----------------------------------------------------------#
+            #   best_iou    13,13,3 每个特征点与真实框的最大重合程度
+            #-----------------------------------------------------------#
             best_iou = K.max(iou, axis=-1)
 
-            # 如果某些预测框和真实框的重合程度大于0.5，则忽略。
+            #-----------------------------------------------------------#
+            #   判断预测框和真实框的最大iou小于ignore_thresh
+            #   则认为该预测框没有与之对应的真实框
+            #   该操作的目的是：
+            #   忽略预测结果与真实框非常对应特征点，因为这些框已经比较准了
+            #   不适合当作负样本，所以忽略掉。
+            #-----------------------------------------------------------#
             ignore_mask = ignore_mask.write(b, K.cast(best_iou<ignore_thresh, K.dtype(true_box)))
             return b+1, ignore_mask
 
-        # 遍历所有的图片
+        #-----------------------------------------------------------#
+        #   在这个地方进行一个循环、循环是对每一张图片进行的
+        #-----------------------------------------------------------#
         _, ignore_mask = K.control_flow_ops.while_loop(lambda b,*args: b<m, loop_body, [0, ignore_mask])
 
-        # 将每幅图的内容压缩，进行处理
+        #-----------------------------------------------------------#
+        #   ignore_mask用于提取出作为负样本的特征点
+        #   (m,13,13,3)
+        #-----------------------------------------------------------#
         ignore_mask = ignore_mask.stack()
-        #(m,13,13,3,1)
+        #   (m,13,13,3,1)
         ignore_mask = K.expand_dims(ignore_mask, -1)
 
+        #-----------------------------------------------------------#
+        #   真实框越大，比重越小，小框的比重更大。
+        #-----------------------------------------------------------#
         box_loss_scale = 2 - y_true[l][...,2:3]*y_true[l][...,3:4]
 
-        # Calculate ciou loss as location loss
+        #-----------------------------------------------------------#
+        #   计算Ciou loss
+        #-----------------------------------------------------------#
         raw_true_box = y_true[l][...,0:4]
         ciou = box_ciou(pred_box, raw_true_box)
         ciou_loss = object_mask * box_loss_scale * (1 - ciou)
-        ciou_loss = K.sum(ciou_loss) / mf
-        location_loss = ciou_loss
         
-        # 如果该位置本来有框，那么计算1与置信度的交叉熵
-        # 如果该位置本来没有框，而且满足best_iou<ignore_thresh，则被认定为负样本
-        # best_iou<ignore_thresh用于限制负样本数量
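+        #------------------------------------------------------------------------------#
+        #   Cells that contain an object: cross-entropy between 1 and the confidence.
+        #   Cells without an object: cross-entropy between 0 and the confidence.
+        #   Some object-free cells are skipped: those with best_iou >= ignore_thresh
+        #   (ignore_mask is 1 only where best_iou < ignore_thresh).
+        #   Such predictions already overlap a ground-truth box well, so they are
+        #   unsuitable as negative samples and are left out of the loss.
+        #------------------------------------------------------------------------------#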
         confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
             (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask
         
         class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True)
 
-        confidence_loss = K.sum(confidence_loss) / mf
-        class_loss = K.sum(class_loss) / mf
+        location_loss = K.sum(tf.where(tf.is_nan(ciou_loss), tf.zeros_like(ciou_loss), ciou_loss))
+        confidence_loss = K.sum(tf.where(tf.is_nan(confidence_loss), tf.zeros_like(confidence_loss), confidence_loss))
+        class_loss = K.sum(tf.where(tf.is_nan(class_loss), tf.zeros_like(class_loss), class_loss))
+        #-----------------------------------------------------------#
+        #   计算正样本数量
+        #-----------------------------------------------------------#
+        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
         loss += location_loss + confidence_loss + class_loss
         # if print_loss:
-        #loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
+        #   loss = tf.Print(loss, [loss, location_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
+        
+    if normalize:
+        loss = loss / num_pos
+    else:
+        loss = loss / mf
     return loss
diff --git a/nets/yolo4.py b/nets/yolo4.py
index ad50737..b5bc0aa 100644
--- a/nets/yolo4.py
+++ b/nets/yolo4.py
@@ -3,17 +3,21 @@ from functools import wraps
 import numpy as np
 import tensorflow as tf
 from keras import backend as K
-from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, MaxPooling2D
+from keras.layers import (Add, Concatenate, Conv2D, MaxPooling2D, UpSampling2D,
+                          ZeroPadding2D)
 from keras.layers.advanced_activations import LeakyReLU
 from keras.layers.normalization import BatchNormalization
 from keras.models import Model
 from keras.regularizers import l2
-from nets.CSPdarknet53 import darknet_body
 from utils.utils import compose
 
+from nets.CSPdarknet53 import darknet_body
+
 
 #--------------------------------------------------#
-#   单次卷积
+#   单次卷积DarknetConv2D
+#   正则化系数为5e-4
+#   如果步长为2则自己设定padding方式。
 #--------------------------------------------------#
 @wraps(Conv2D)
 def DarknetConv2D(*args, **kwargs):
@@ -23,7 +27,7 @@ def DarknetConv2D(*args, **kwargs):
     return Conv2D(*args, **darknet_conv_kwargs)
 
 #---------------------------------------------------#
-#   卷积块
+#   卷积块 -> 卷积 + 标准化 + 激活函数
 #   DarknetConv2D + BatchNormalization + LeakyReLU
 #---------------------------------------------------#
 def DarknetConv2D_BN_Leaky(*args, **kwargs):
@@ -35,7 +39,7 @@ def DarknetConv2D_BN_Leaky(*args, **kwargs):
         LeakyReLU(alpha=0.1))
 
 #---------------------------------------------------#
-#   特征层->最后的输出
+#   进行五次卷积
 #---------------------------------------------------#
 def make_five_convs(x, num_filters):
     # 五次卷积
@@ -47,14 +51,19 @@ def make_five_convs(x, num_filters):
     return x
 
 #---------------------------------------------------#
-#   特征层->最后的输出
+#   Panet网络的构建，并且获得预测结果
 #---------------------------------------------------#
 def yolo_body(inputs, num_anchors, num_classes):
-    # 生成darknet53的主干模型
+    #---------------------------------------------------#   
+    #   生成CSPdarknet53的主干模型
+    #   获得三个有效特征层，他们的shape分别是：
+    #   52,52,256
+    #   26,26,512
+    #   13,13,1024
+    #---------------------------------------------------#
     feat1,feat2,feat3 = darknet_body(inputs)
 
-    # 第一个特征层
-    # y1=(batch_size,13,13,3,85)
+    # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,2048 -> 13,13,512 -> 13,13,1024 -> 13,13,512
     P5 = DarknetConv2D_BN_Leaky(512, (1,1))(feat3)
     P5 = DarknetConv2D_BN_Leaky(1024, (3,3))(P5)
     P5 = DarknetConv2D_BN_Leaky(512, (1,1))(P5)
@@ -67,38 +76,60 @@ def yolo_body(inputs, num_anchors, num_classes):
     P5 = DarknetConv2D_BN_Leaky(1024, (3,3))(P5)
     P5 = DarknetConv2D_BN_Leaky(512, (1,1))(P5)
 
+    # 13,13,512 -> 13,13,256 -> 26,26,256
     P5_upsample = compose(DarknetConv2D_BN_Leaky(256, (1,1)), UpSampling2D(2))(P5)
-    
+    # 26,26,512 -> 26,26,256
     P4 = DarknetConv2D_BN_Leaky(256, (1,1))(feat2)
+    # 26,26,256 + 26,26,256 -> 26,26,512
     P4 = Concatenate()([P4, P5_upsample])
+    
+    # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
     P4 = make_five_convs(P4,256)
 
+    # 26,26,256 -> 26,26,128 -> 52,52,128
     P4_upsample = compose(DarknetConv2D_BN_Leaky(128, (1,1)), UpSampling2D(2))(P4)
-    
+    # 52,52,256 -> 52,52,128
     P3 = DarknetConv2D_BN_Leaky(128, (1,1))(feat1)
+    # 52,52,128 + 52,52,128 -> 52,52,256
     P3 = Concatenate()([P3, P4_upsample])
-    P3 = make_five_convs(P3,128)
 
+    # 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128 -> 52,52,256 -> 52,52,128
+    P3 = make_five_convs(P3,128)
+    
+    #---------------------------------------------------#
+    #   第三个特征层
+    #   y3=(batch_size,52,52,3,85)
+    #---------------------------------------------------#
     P3_output = DarknetConv2D_BN_Leaky(256, (3,3))(P3)
     P3_output = DarknetConv2D(num_anchors*(num_classes+5), (1,1))(P3_output)
 
-    #26,26 output
+    # 52,52,128 -> 26,26,256
     P3_downsample = ZeroPadding2D(((1,0),(1,0)))(P3)
     P3_downsample = DarknetConv2D_BN_Leaky(256, (3,3), strides=(2,2))(P3_downsample)
+    # 26,26,256 + 26,26,256 -> 26,26,512
     P4 = Concatenate()([P3_downsample, P4])
+    # 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256 -> 26,26,512 -> 26,26,256
     P4 = make_five_convs(P4,256)
     
+    #---------------------------------------------------#
+    #   第二个特征层
+    #   y2=(batch_size,26,26,3,85)
+    #---------------------------------------------------#
     P4_output = DarknetConv2D_BN_Leaky(512, (3,3))(P4)
     P4_output = DarknetConv2D(num_anchors*(num_classes+5), (1,1))(P4_output)
     
-
-    #13,13 output
+    # 26,26,256 -> 13,13,512
     P4_downsample = ZeroPadding2D(((1,0),(1,0)))(P4)
     P4_downsample = DarknetConv2D_BN_Leaky(512, (3,3), strides=(2,2))(P4_downsample)
+    # 13,13,512 + 13,13,512 -> 13,13,1024
     P5 = Concatenate()([P4_downsample, P5])
+    # 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512 -> 13,13,1024 -> 13,13,512
     P5 = make_five_convs(P5,512)
     
-
+    #---------------------------------------------------#
+    #   第一个特征层
+    #   y1=(batch_size,13,13,3,85)
+    #---------------------------------------------------#
     P5_output = DarknetConv2D_BN_Leaky(1024, (3,3))(P5)
     P5_output = DarknetConv2D(num_anchors*(num_classes+5), (1,1))(P5_output)
 
@@ -109,12 +140,16 @@ def yolo_body(inputs, num_anchors, num_classes):
 #---------------------------------------------------#
 def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
     num_anchors = len(anchors)
-    # [1, 1, 1, num_anchors, 2]
+    #---------------------------------------------------#
+    #   [1, 1, 1, num_anchors, 2]
+    #---------------------------------------------------#
     anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])
 
-    # 获得x，y的网格
-    # (13,13, 1, 2)
-    grid_shape = K.shape(feats)[1:3] # height, width
+    #---------------------------------------------------#
+    #   获得x，y的网格
+    #   (13, 13, 1, 2)
+    #---------------------------------------------------#
+    grid_shape = K.shape(feats)[1:3]
     grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
         [1, grid_shape[1], 1, 1])
     grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
@@ -122,18 +157,29 @@ def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
     grid = K.concatenate([grid_x, grid_y])
     grid = K.cast(grid, K.dtype(feats))
 
-    # (batch_size,13,13,3,85)
+    #---------------------------------------------------#
+    #   将预测结果调整成(batch_size,13,13,3,85)
+    #   85可拆分成4 + 1 + 80
+    #   4代表的是中心宽高的调整参数
+    #   1代表的是框的置信度
+    #   80代表的是种类的置信度
+    #---------------------------------------------------#
     feats = K.reshape(feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5])
 
-    # 将预测值调成真实值
-    # box_xy对应框的中心点
-    # box_wh对应框的宽和高
+    #---------------------------------------------------#
+    #   将预测值调成真实值
+    #   box_xy对应框的中心点
+    #   box_wh对应框的宽和高
+    #---------------------------------------------------#
     box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
     box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))
     box_confidence = K.sigmoid(feats[..., 4:5])
     box_class_probs = K.sigmoid(feats[..., 5:])
 
-    # 在计算loss的时候返回如下参数
+    #---------------------------------------------------------------------#
+    #   在计算loss的时候返回grid, feats, box_xy, box_wh
+    #   在预测的时候返回box_xy, box_wh, box_confidence, box_class_probs
+    #---------------------------------------------------------------------#
     if calc_loss == True:
         return grid, feats, box_xy, box_wh
     return box_xy, box_wh, box_confidence, box_class_probs
@@ -142,6 +188,9 @@ def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
 #   对box进行调整，使其符合真实图片的样子
 #---------------------------------------------------#
 def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
+    #-----------------------------------------------------------------#
+    #   把y轴放前面是因为方便预测框和图像的宽高进行相乘
+    #-----------------------------------------------------------------#
     box_yx = box_xy[..., ::-1]
     box_hw = box_wh[..., ::-1]
     
@@ -149,6 +198,10 @@ def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
     image_shape = K.cast(image_shape, K.dtype(box_yx))
 
     new_shape = K.round(image_shape * K.min(input_shape/image_shape))
+    #-----------------------------------------------------------------#
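+    #   offset is the offset of the valid (non-padded) image area from the top-left corner, as a fraction of input_shape
+    #   new_shape is the image size after aspect-ratio-preserving scaling to fit inside input_shape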
+    #-----------------------------------------------------------------#
     offset = (input_shape-new_shape)/2./input_shape
     scale = input_shape/new_shape
 
@@ -171,14 +224,24 @@ def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
 #   获取每个box和它的得分
 #---------------------------------------------------#
 def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape):
-    # 将预测值调成真实值
-    # box_xy对应框的中心点
-    # box_wh对应框的宽和高
-    # -1,13,13,3,2; -1,13,13,3,2; -1,13,13,3,1; -1,13,13,3,80
+    #-----------------------------------------------------------------#
+    #   将预测值调成真实值
+    #   box_xy : -1,13,13,3,2; 
+    #   box_wh : -1,13,13,3,2; 
+    #   box_confidence : -1,13,13,3,1; 
+    #   box_class_probs : -1,13,13,3,80;
+    #-----------------------------------------------------------------#
     box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, anchors, num_classes, input_shape)
-    # 将box_xy、和box_wh调节成y_min,y_max,xmin,xmax
+    #-----------------------------------------------------------------#
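+    #   Before prediction the image goes through letterbox_image, which pads it with gray bars,
+    #   so the resulting box_xy and box_wh are relative to the padded image.
+    #   We need to correct them to remove the gray-bar region,
+    #   converting box_xy and box_wh into y_min, y_max, x_min, x_max.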
+    #-----------------------------------------------------------------#
     boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape)
-    # 获得得分和box
+    #-----------------------------------------------------------------#
+    #   获得最终得分和框的位置
+    #-----------------------------------------------------------------#
     boxes = K.reshape(boxes, [-1, 4])
     box_scores = box_confidence * box_class_probs
     box_scores = K.reshape(box_scores, [-1, num_classes])
@@ -194,42 +257,63 @@ def yolo_eval(yolo_outputs,
               max_boxes=20,
               score_threshold=.6,
               iou_threshold=.5):
-    # 获得特征层的数量
+    #---------------------------------------------------#
+    #   获得特征层的数量，有效特征层的数量为3
+    #---------------------------------------------------#
     num_layers = len(yolo_outputs)
-    # 特征层1对应的anchor是678
-    # 特征层2对应的anchor是345
-    # 特征层3对应的anchor是012
+    #-----------------------------------------------------------#
+    #   13x13的特征层对应的anchor是[142, 110], [192, 243], [459, 401]
+    #   26x26的特征层对应的anchor是[36, 75], [76, 55], [72, 146]
+    #   52x52的特征层对应的anchor是[12, 16], [19, 36], [40, 28]
+    #-----------------------------------------------------------#
     anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
     
+    #-----------------------------------------------------------#
+    #   这里获得的是输入图片的大小，一般是416x416
+    #-----------------------------------------------------------#
     input_shape = K.shape(yolo_outputs[0])[1:3] * 32
     boxes = []
     box_scores = []
-    # 对每个特征层进行处理
+    #-----------------------------------------------------------#
+    #   对每个特征层进行处理
+    #-----------------------------------------------------------#
     for l in range(num_layers):
         _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l], anchors[anchor_mask[l]], num_classes, input_shape, image_shape)
         boxes.append(_boxes)
         box_scores.append(_box_scores)
-    # 将每个特征层的结果进行堆叠
+    #-----------------------------------------------------------#
+    #   将每个特征层的结果进行堆叠
+    #-----------------------------------------------------------#
     boxes = K.concatenate(boxes, axis=0)
     box_scores = K.concatenate(box_scores, axis=0)
 
+    #-----------------------------------------------------------#
+    #   判断得分是否大于score_threshold
+    #-----------------------------------------------------------#
     mask = box_scores >= score_threshold
     max_boxes_tensor = K.constant(max_boxes, dtype='int32')
     boxes_ = []
     scores_ = []
     classes_ = []
     for c in range(num_classes):
-        # 取出所有box_scores >= score_threshold的框，和成绩
+        #-----------------------------------------------------------#
+        #   取出所有box_scores >= score_threshold的框，和成绩
+        #-----------------------------------------------------------#
         class_boxes = tf.boolean_mask(boxes, mask[:, c])
         class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c])
 
-        # 非极大抑制，去掉box重合程度高的那一些
+        #-----------------------------------------------------------#
+        #   非极大抑制
+        #   保留一定区域内得分最大的框
+        #-----------------------------------------------------------#
         nms_index = tf.image.non_max_suppression(
             class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold)
 
-        # 获取非极大抑制后的结果
-        # 下列三个分别是
-        # 框的位置，得分与种类
+        #-----------------------------------------------------------#
+        #   获取非极大抑制后的结果
+        #   下列三个分别是
+        #   框的位置，得分与种类
+        #-----------------------------------------------------------#
         class_boxes = K.gather(class_boxes, nms_index)
         class_box_scores = K.gather(class_box_scores, nms_index)
         classes = K.ones_like(class_box_scores, 'int32') * c
diff --git a/predict.py b/predict.py
index 5ff4e03..cbb247c 100644
--- a/predict.py
+++ b/predict.py
@@ -1,6 +1,16 @@
-from yolo import YOLO
+'''
+predict.py有几个注意点
+1、无法进行批量预测，如果想要批量预测，可以利用os.listdir()遍历文件夹，利用Image.open打开图片文件进行预测。
+2、如果想要保存，利用r_image.save("img.jpg")即可保存。
+3、如果想要获得框的坐标，可以进入detect_image函数，读取top,left,bottom,right这四个值。
+4、如果想要截取下目标，可以利用获取到的top,left,bottom,right这四个值在原图上利用矩阵的方式进行截取。
+'''
+from keras.layers import Input
 from PIL import Image
 
+from nets.yolo4 import yolo_body
+from yolo import YOLO
+
 yolo = YOLO()
 
 while True:
diff --git a/test.py b/test.py
index ef3b2ef..f873741 100644
--- a/test.py
+++ b/test.py
@@ -3,11 +3,14 @@
 #   map测试请看get_dr_txt.py、get_gt_txt.py
 #   和get_map.py
 #--------------------------------------------#
-from nets.yolo4 import yolo_body
 from keras.layers import Input
-inputs = Input([416,416,3])
-model = yolo_body(inputs,3,80)
-model.summary()
 
-for i,layer in enumerate(model.layers):
-    print(i,layer.name)
+from nets.yolo4 import yolo_body
+
+if __name__ == "__main__":
+    inputs = Input([416, 416, 3])
+    model = yolo_body(inputs, 3, 80)
+    model.summary()
+
+    # for i,layer in enumerate(model.layers):
+    #     print(i,layer.name)
diff --git a/train.py b/train.py
index 948e1c3..97d7991 100644
--- a/train.py
+++ b/train.py
@@ -1,14 +1,17 @@
+import keras.backend as K
 import numpy as np
 import tensorflow as tf
-import keras.backend as K
+from keras.backend.tensorflow_backend import set_session
+from keras.callbacks import (EarlyStopping, ModelCheckpoint, ReduceLROnPlateau,
+                             TensorBoard)
 from keras.layers import Input, Lambda
 from keras.models import Model
 from keras.optimizers import Adam
-from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
-from nets.yolo4 import yolo_body
+
 from nets.loss import yolo_loss
-from keras.backend.tensorflow_backend import set_session
-from utils.utils import get_random_data,get_random_data_with_Mosaic,rand,WarmUpCosineDecayScheduler
+from nets.yolo4 import yolo_body
+from utils.utils import (WarmUpCosineDecayScheduler, get_random_data,
+                         get_random_data_with_Mosaic, rand)
 
 
 #---------------------------------------------------#
@@ -31,8 +34,7 @@ def get_anchors(anchors_path):
 #---------------------------------------------------#
 #   训练数据生成器
 #---------------------------------------------------#
-def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, mosaic=False):
-    '''data generator for fit_generator'''
+def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, mosaic=False, random=True):
     n = len(annotation_lines)
     i = 0
     flag = True
@@ -47,11 +49,11 @@ def data_generator(annotation_lines, batch_size, input_shape, anchors, num_class
                     image, box = get_random_data_with_Mosaic(annotation_lines[i:i+4], input_shape)
                     i = (i+1) % n
                 else:
-                    image, box = get_random_data(annotation_lines[i], input_shape)
+                    image, box = get_random_data(annotation_lines[i], input_shape, random=random)
                     i = (i+1) % n
                 flag = bool(1-flag)
             else:
-                image, box = get_random_data(annotation_lines[i], input_shape)
+                image, box = get_random_data(annotation_lines[i], input_shape, random=random)
                 i = (i+1) % n
             image_data.append(image)
             box_data.append(box)
@@ -60,7 +62,6 @@ def data_generator(annotation_lines, batch_size, input_shape, anchors, num_class
         y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes)
         yield [image_data, *y_true], np.zeros(batch_size)
 
-
 #---------------------------------------------------#
 #   读入xml文件，并输出y_true
 #---------------------------------------------------#
@@ -68,80 +69,130 @@ def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes):
     assert (true_boxes[..., 4]<num_classes).all(), 'class id must be less than num_classes'
     # 一共有三个特征层数
     num_layers = len(anchors)//3
-    # 先验框
-    # 678为 142,110,  192,243,  459,401
-    # 345为 36,75,  76,55,  72,146
-    # 012为 12,16,  19,36,  40,28
-    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]]
+    #-----------------------------------------------------------#
+    #   13x13的特征层对应的anchor是[142, 110], [192, 243], [459, 401]
+    #   26x26的特征层对应的anchor是[36, 75], [76, 55], [72, 146]
+    #   52x52的特征层对应的anchor是[12, 16], [19, 36], [40, 28]
+    #-----------------------------------------------------------#
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
 
+    #-----------------------------------------------------------#
+    #   获得框的坐标和图片的大小
+    #-----------------------------------------------------------#
     true_boxes = np.array(true_boxes, dtype='float32')
-    input_shape = np.array(input_shape, dtype='int32') # 416,416
-    # 读出xy轴，读出长宽
-    # 中心点(m,n,2)
+    input_shape = np.array(input_shape, dtype='int32')
+    #-----------------------------------------------------------#
+    #   通过计算获得真实框的中心和宽高
+    #   中心点(m,n,2) 宽高(m,n,2)
+    #-----------------------------------------------------------#
     boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
     boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]
-    # 计算比例
+    #-----------------------------------------------------------#
+    #   将真实框归一化到小数形式
+    #-----------------------------------------------------------#
     true_boxes[..., 0:2] = boxes_xy/input_shape[::-1]
     true_boxes[..., 2:4] = boxes_wh/input_shape[::-1]
 
-    # m张图
+    # m为图片数量，grid_shapes为网格的shape
     m = true_boxes.shape[0]
-    # 得到网格的shape为13,13;26,26;52,52
     grid_shapes = [input_shape//{0:32, 1:16, 2:8}[l] for l in range(num_layers)]
-    # y_true的格式为(m,13,13,3,85)(m,26,26,3,85)(m,52,52,3,85)
+    #-----------------------------------------------------------#
+    #   y_true的格式为(m,13,13,3,85)(m,26,26,3,85)(m,52,52,3,85)
+    #-----------------------------------------------------------#
     y_true = [np.zeros((m,grid_shapes[l][0],grid_shapes[l][1],len(anchor_mask[l]),5+num_classes),
         dtype='float32') for l in range(num_layers)]
-    # [1,9,2]
+
+    #-----------------------------------------------------------#
+    #   [9,2] -> [1,9,2]
+    #-----------------------------------------------------------#
     anchors = np.expand_dims(anchors, 0)
     anchor_maxes = anchors / 2.
     anchor_mins = -anchor_maxes
-    # 长宽要大于0才有效
+
+    #-----------------------------------------------------------#
+    #   长宽要大于0才有效
+    #-----------------------------------------------------------#
     valid_mask = boxes_wh[..., 0]>0
 
     for b in range(m):
         # 对每一张图进行处理
         wh = boxes_wh[b, valid_mask[b]]
         if len(wh)==0: continue
-        # [n,1,2]
+        #-----------------------------------------------------------#
+        #   [n,2] -> [n,1,2]
+        #-----------------------------------------------------------#
         wh = np.expand_dims(wh, -2)
         box_maxes = wh / 2.
         box_mins = -box_maxes
 
-        # 计算真实框和哪个先验框最契合
+        #-----------------------------------------------------------#
+        #   计算所有真实框和先验框的交并比
+        #   intersect_area  [n,9]
+        #   box_area        [n,1]
+        #   anchor_area     [1,9]
+        #   iou             [n,9]
+        #-----------------------------------------------------------#
         intersect_mins = np.maximum(box_mins, anchor_mins)
         intersect_maxes = np.minimum(box_maxes, anchor_maxes)
         intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
         intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
+
         box_area = wh[..., 0] * wh[..., 1]
         anchor_area = anchors[..., 0] * anchors[..., 1]
+
         iou = intersect_area / (box_area + anchor_area - intersect_area)
-        # 维度是(n) 感谢 消尽不死鸟 的提醒
+        #-----------------------------------------------------------#
+        #   维度是[n,] 感谢 消尽不死鸟 的提醒
+        #-----------------------------------------------------------#
         best_anchor = np.argmax(iou, axis=-1)
 
         for t, n in enumerate(best_anchor):
+            #-----------------------------------------------------------#
+            #   找到每个真实框所属的特征层
+            #-----------------------------------------------------------#
             for l in range(num_layers):
                 if n in anchor_mask[l]:
-                    # floor用于向下取整
-                    i = np.floor(true_boxes[b,t,0]*grid_shapes[l][1]).astype('int32')
-                    j = np.floor(true_boxes[b,t,1]*grid_shapes[l][0]).astype('int32')
-                    # 找到真实框在特征层l中第b副图像对应的位置
+                    #-----------------------------------------------------------#
+                    #   floor用于向下取整，找到真实框所属的特征层对应的x、y轴坐标
+                    #-----------------------------------------------------------#
+                    i = np.floor(true_boxes[b,t,0] * grid_shapes[l][1]).astype('int32')
+                    j = np.floor(true_boxes[b,t,1] * grid_shapes[l][0]).astype('int32')
+                    #-----------------------------------------------------------#
+                    #   k is the index of the matched anchor n within anchor_mask[l], i.e. the k-th anchor of this feature point
+                    #-----------------------------------------------------------#
                     k = anchor_mask[l].index(n)
-                    c = true_boxes[b,t, 4].astype('int32')
-                    y_true[l][b, j, i, k, 0:4] = true_boxes[b,t, 0:4]
+                    #-----------------------------------------------------------#
+                    #   c指的是当前这个真实框的种类
+                    #-----------------------------------------------------------#
+                    c = true_boxes[b, t, 4].astype('int32')
+                    #-----------------------------------------------------------#
+                    #   y_true的shape为(m,13,13,3,85)(m,26,26,3,85)(m,52,52,3,85)
+                    #   最后的85可以拆分成4+1+80，4代表的是框的中心与宽高、
+                    #   1代表的是置信度、80代表的是种类
+                    #-----------------------------------------------------------#
+                    y_true[l][b, j, i, k, 0:4] = true_boxes[b, t, 0:4]
                     y_true[l][b, j, i, k, 4] = 1
                     y_true[l][b, j, i, k, 5+c] = 1
 
     return y_true
 
-
 #----------------------------------------------------#
 #   检测精度mAP和pr曲线计算参考视频
 #   https://www.bilibili.com/video/BV1zE411u7Vw
 #----------------------------------------------------#
 if __name__ == "__main__":
-    # 标签的位置
+    #----------------------------------------------------#
+    #   获得图片路径和标签
+    #----------------------------------------------------#
     annotation_path = '2007_train.txt'
-    # 获取classes和anchor的位置
+    #------------------------------------------------------#
+    #   训练后的模型保存的位置，保存在logs文件夹里面
+    #------------------------------------------------------#
+    log_dir = 'logs/'
+    #----------------------------------------------------#
+    #   classes和anchor的路径，非常重要
+    #   训练前一定要修改classes_path，使其对应自己的数据集
+    #----------------------------------------------------#
     classes_path = 'model_data/voc_classes.txt'    
     anchors_path = 'model_data/yolo_anchors.txt'
     #------------------------------------------------------#
@@ -150,58 +201,81 @@ if __name__ == "__main__":
     #   预测的东西都不一样了自然维度不匹配
     #------------------------------------------------------#
     weights_path = 'model_data/yolo4_weight.h5'
-    # 获得classes和anchor
+    #------------------------------------------------------#
+    #   训练用图片大小
+    #   一般在416x416和608x608选择
+    #------------------------------------------------------#
+    input_shape = (416,416)
+    #------------------------------------------------------#
+    #   是否对损失进行归一化
+    #------------------------------------------------------#
+    normalize = True
+
+    #----------------------------------------------------#
+    #   获取classes和anchor
+    #----------------------------------------------------#
     class_names = get_classes(classes_path)
     anchors = get_anchors(anchors_path)
-    # 一共有多少类
+    #------------------------------------------------------#
+    #   一共有多少类和多少先验框
+    #------------------------------------------------------#
     num_classes = len(class_names)
     num_anchors = len(anchors)
-    # 训练后的模型保存的位置
-    log_dir = 'logs/'
-    # 输入的shape大小
-    # 显存比较小可以使用416x416
-    # 现存比较大可以使用608x608
-    input_shape = (416,416)
+    #------------------------------------------------------#
+    #   YOLOv4 training tricks
+    #   mosaic: mosaic data augmentation, True or False
+    #   Cosine_scheduler: cosine-annealing learning-rate schedule, True or False
+    #   label_smoothing: label smoothing, typically 0.01 or less (e.g. 0.01, 0.005)
+    #------------------------------------------------------#
     mosaic = True
     Cosine_scheduler = False
     label_smoothing = 0
 
-    # 清除session
     K.clear_session()
-
-    # 输入的图像为
+    #------------------------------------------------------#
+    #   创建yolo模型
+    #------------------------------------------------------#
     image_input = Input(shape=(None, None, 3))
     h, w = input_shape
-
-    # 创建yolo模型
     print('Create YOLOv4 model with {} anchors and {} classes.'.format(num_anchors, num_classes))
     model_body = yolo_body(image_input, num_anchors//3, num_classes)
     
-    # 载入预训练权重
+    #------------------------------------------------------#
+    #   载入预训练权重
+    #------------------------------------------------------#
     print('Load weights {}.'.format(weights_path))
     model_body.load_weights(weights_path, by_name=True, skip_mismatch=True)
     
-    # y_true为13,13,3,85
-    # 26,26,3,85
-    # 52,52,3,85
+    #------------------------------------------------------#
+    #   在这个地方设置损失，将网络的输出结果传入loss函数
+    #   把整个模型的输出作为loss
+    #------------------------------------------------------#
     y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \
         num_anchors//3, num_classes+5)) for l in range(3)]
-
-    # 输入为*model_body.input, *y_true
-    # 输出为model_loss
     loss_input = [*model_body.output, *y_true]
     model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss',
-        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5, 'label_smoothing': label_smoothing})(loss_input)
+        arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5, 
+            'label_smoothing': label_smoothing, 'normalize': normalize})(loss_input)
 
     model = Model([model_body.input, *y_true], model_loss)
 
-    # 训练参数设置
+    #-------------------------------------------------------------------------------#
+    #   训练参数的设置
+    #   logging表示tensorboard的保存地址
+    #   checkpoint用于设置权值保存的细节，period用于修改多少epoch保存一次
+    #   reduce_lr用于设置学习率下降的方式
+    #   early_stopping用于设定早停，val_loss多次不下降自动结束训练，表示模型基本收敛
+    #-------------------------------------------------------------------------------#
     logging = TensorBoard(log_dir=log_dir)
     checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
         monitor='val_loss', save_weights_only=True, save_best_only=False, period=1)
     early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
 
-    # 0.1用于验证，0.9用于训练
+    #----------------------------------------------------------------------#
+    #   验证集的划分在train.py代码里面进行
+    #   2007_test.txt和2007_val.txt里面没有内容是正常的。训练不会使用到。
+    #   当前划分方式下，验证集和训练集的比例为1:9
+    #----------------------------------------------------------------------#
     val_split = 0.1
     with open(annotation_path) as f:
         lines = f.readlines()
@@ -211,6 +285,10 @@ if __name__ == "__main__":
     num_val = int(len(lines)*val_split)
     num_train = len(lines) - num_val
     
+    freeze_layers = 249
+    for i in range(freeze_layers): model_body.layers[i].trainable = False
+    print('Freeze the first {} layers of total {} layers.'.format(freeze_layers, len(model_body.layers)))
+
     #------------------------------------------------------#
     #   主干特征提取网络特征通用，冻结训练可以加快训练速度
     #   也可以在训练初期防止权值被破坏。
@@ -219,18 +297,12 @@ if __name__ == "__main__":
     #   Epoch总训练世代
     #   提示OOM或者显存不足请调小Batch_size
     #------------------------------------------------------#
-    freeze_layers = 249
-    for i in range(freeze_layers): model_body.layers[i].trainable = False
-    print('Freeze the first {} layers of total {} layers.'.format(freeze_layers, len(model_body.layers)))
-
-    # 调整非主干模型first
     if True:
         Init_epoch = 0
         Freeze_epoch = 50
-        # batch_size大小，每次喂入多少数据
         batch_size = 8
-        # 最大学习率
         learning_rate_base = 1e-3
+
         if Cosine_scheduler:
             # 预热期
             warmup_epoch = int((Freeze_epoch-Init_epoch)*0.2)
@@ -252,9 +324,9 @@ if __name__ == "__main__":
             model.compile(optimizer=Adam(learning_rate_base), loss={'yolo_loss': lambda y_true, y_pred: y_pred})
 
         print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
-        model.fit_generator(data_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, mosaic=mosaic),
+        model.fit_generator(data_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, mosaic=mosaic, random=True),
                 steps_per_epoch=max(1, num_train//batch_size),
-                validation_data=data_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, mosaic=False),
+                validation_data=data_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, mosaic=False, random=False),
                 validation_steps=max(1, num_val//batch_size),
                 epochs=Freeze_epoch,
                 initial_epoch=Init_epoch,
@@ -263,15 +335,12 @@ if __name__ == "__main__":
 
     for i in range(freeze_layers): model_body.layers[i].trainable = True
 
-    # 解冻后训练
     if True:
         Freeze_epoch = 50
         Epoch = 100
-        # batch_size大小，每次喂入多少数据
         batch_size = 2
-
-        # 最大学习率
         learning_rate_base = 1e-4
+
         if Cosine_scheduler:
             # 预热期
             warmup_epoch = int((Epoch-Freeze_epoch)*0.2)
@@ -293,9 +362,9 @@ if __name__ == "__main__":
             model.compile(optimizer=Adam(learning_rate_base), loss={'yolo_loss': lambda y_true, y_pred: y_pred})
 
         print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size))
-        model.fit_generator(data_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, mosaic=mosaic),
+        model.fit_generator(data_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, mosaic=mosaic, random=True),
                 steps_per_epoch=max(1, num_train//batch_size),
-                validation_data=data_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, mosaic=False),
+                validation_data=data_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, mosaic=False, random=False),
                 validation_steps=max(1, num_val//batch_size),
                 epochs=Epoch,
                 initial_epoch=Freeze_epoch,
diff --git a/utils/utils.py b/utils/utils.py
index ed334a4..6be23a8 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -1,12 +1,12 @@
-"""Miscellaneous utility functions."""
+from functools import reduce
 
-import numpy as np
+import cv2
 import keras
 import keras.backend as K
-from functools import reduce
+import numpy as np
+from matplotlib.colors import hsv_to_rgb, rgb_to_hsv
 from PIL import Image
-from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
-import cv2
+
 
 def compose(*funcs):
     if funcs:
@@ -101,8 +101,8 @@ def merge_bboxes(bboxes, cutx, cuty):
 def get_random_data_with_Mosaic(annotation_line, input_shape, max_boxes=100, hue=.1, sat=1.5, val=1.5):
     '''random preprocessing for real-time data augmentation'''
     h, w = input_shape
-    min_offset_x = 0.4
-    min_offset_y = 0.4
+    min_offset_x = 0.3
+    min_offset_y = 0.3
     scale_low = 1-min(min_offset_x,min_offset_y)
     scale_high = scale_low+0.2
 
@@ -112,6 +112,7 @@ def get_random_data_with_Mosaic(annotation_line, input_shape, max_boxes=100, hue
 
     place_x = [0,0,int(w*min_offset_x),int(w*min_offset_x)]
     place_y = [0,int(h*min_offset_y),int(h*min_offset_y),0]
+
     for line in annotation_line:
         # 每一行进行分割
         line_content = line.split()
@@ -163,7 +164,6 @@ def get_random_data_with_Mosaic(annotation_line, input_shape, max_boxes=100, hue
         new_image.paste(image, (dx, dy))
         image_data = np.array(new_image)/255
 
-        
         index = index + 1
         box_data = []
         # 对box进行重新处理
@@ -183,8 +183,6 @@ def get_random_data_with_Mosaic(annotation_line, input_shape, max_boxes=100, hue
         image_datas.append(image_data)
         box_datas.append(box_data)
 
-
-    
     # 将图片分割，放在一起
     cutx = np.random.randint(int(w*min_offset_x), int(w*(1 - min_offset_x)))
     cuty = np.random.randint(int(h*min_offset_y), int(h*(1 - min_offset_y)))
@@ -206,7 +204,7 @@ def get_random_data_with_Mosaic(annotation_line, input_shape, max_boxes=100, hue
     return new_image, box_data
 
 
-def get_random_data(annotation_line, input_shape, max_boxes=100, jitter=.3, hue=.1, sat=1.5, val=1.5):
+def get_random_data(annotation_line, input_shape, max_boxes=100, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
     '''random preprocessing for real-time data augmentation'''
     line = annotation_line.split()
     image = Image.open(line[0])
@@ -214,6 +212,36 @@ def get_random_data(annotation_line, input_shape, max_boxes=100, jitter=.3, hue=
     h, w = input_shape
     box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
 
+    if not random:
+        # resize image
+        scale = min(w/iw, h/ih)
+        nw = int(iw*scale)
+        nh = int(ih*scale)
+        dx = (w-nw)//2
+        dy = (h-nh)//2
+
+        image = image.resize((nw,nh), Image.BICUBIC)
+        new_image = Image.new('RGB', (w,h), (128,128,128))
+        new_image.paste(image, (dx, dy))
+        image_data = np.array(new_image, np.float32)/255
+
+        # correct boxes
+        box_data = np.zeros((max_boxes,5))
+        if len(box)>0:
+            np.random.shuffle(box)
+            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
+            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
+            box[:, 0:2][box[:, 0:2]<0] = 0
+            box[:, 2][box[:, 2]>w] = w
+            box[:, 3][box[:, 3]>h] = h
+            box_w = box[:, 2] - box[:, 0]
+            box_h = box[:, 3] - box[:, 1]
+            box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
+            if len(box)>max_boxes: box = box[:max_boxes]
+            box_data[:len(box)] = box
+
+        return image_data, box_data
+        
     # 对图像进行缩放并且进行长和宽的扭曲
     new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter)
     scale = rand(.25, 2)
diff --git a/video.py b/video.py
index cf64691..6488b3d 100644
--- a/video.py
+++ b/video.py
@@ -1,15 +1,24 @@
 #-------------------------------------#
-#       调用摄像头检测
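+#   Run detection on a webcam or on a video file.
+#   Webcam: run this script as-is.
+#   Video file: pass the file path to cv2.VideoCapture().
+#   To save the result, write the frames with cv2.VideoWriter.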
 #-------------------------------------#
+import time
+
+import cv2
+import numpy as np
 from keras.layers import Input
-from yolo import YOLO
 from PIL import Image
-import numpy as np
-import cv2
-import time
+
+from yolo import YOLO
+
 yolo = YOLO()
-# 调用摄像头
-capture=cv2.VideoCapture(0) # capture=cv2.VideoCapture("1.mp4")
+#-------------------------------------#
+#   Open the default webcam (device 0).
+#   For a video file use: capture=cv2.VideoCapture("1.mp4")
+#-------------------------------------#
+capture=cv2.VideoCapture(0)
 
 fps = 0.0
 while(True):
@@ -20,10 +29,8 @@ while(True):
     frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
     # 转变成Image
     frame = Image.fromarray(np.uint8(frame))
-
     # 进行检测
     frame = np.array(yolo.detect_image(frame))
-
     # RGBtoBGR满足opencv显示格式
     frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
     
@@ -37,4 +44,5 @@ while(True):
         capture.release()
         break
 
-yolo.close_session()    
+yolo.close_session()
+    
diff --git a/vision_for_anchors.py b/vision_for_anchors.py
index 894f668..ceb5bab 100644
--- a/vision_for_anchors.py
+++ b/vision_for_anchors.py
@@ -1,5 +1,7 @@
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
+
 def sigmoid(x):
     s = 1 / (1 + np.exp(-x))
     return s
@@ -78,4 +80,4 @@ def yolo_head(feats, anchors, num_classes):
     #
 feat = np.random.normal(0,0.5,[4,13,13,75])
 anchors = [[142, 110],[192, 243],[459, 401]]
-yolo_head(feat,anchors,20)
\ No newline at end of file
+yolo_head(feat,anchors,20)
diff --git a/voc_annotation.py b/voc_annotation.py
index aad0657..ca0f88d 100644
--- a/voc_annotation.py
+++ b/voc_annotation.py
@@ -1,3 +1,8 @@
+#---------------------------------------------#
+#   运行前一定要修改classes
+#   如果生成的2007_train.txt里面没有目标信息
+#   那么就是因为classes没有设定正确
+#---------------------------------------------#
 import xml.etree.ElementTree as ET
 from os import getcwd
 
diff --git a/yolo.py b/yolo.py
index 0ac6e1e..f688547 100644
--- a/yolo.py
+++ b/yolo.py
@@ -1,17 +1,23 @@
-import os
-import numpy as np
-import copy
 import colorsys
+import copy
+import os
 from timeit import default_timer as timer
+
+import numpy as np
 from keras import backend as K
-from keras.models import load_model
 from keras.layers import Input
-from PIL import Image, ImageFont, ImageDraw
-from nets.yolo4 import yolo_body,yolo_eval
+from keras.models import load_model
+from PIL import Image, ImageDraw, ImageFont
+
+from nets.yolo4 import yolo_body, yolo_eval
 from utils.utils import letterbox_image
+
+
 #--------------------------------------------#
 #   使用自己训练好的模型预测需要修改2个参数
 #   model_path和classes_path都需要修改！
+#   A shape-mismatch error means model_path and classes_path
+#   here do not match the ones that were used for training.
 #--------------------------------------------#
 class YOLO(object):
     _defaults = {
@@ -64,18 +70,22 @@ class YOLO(object):
         return np.array(anchors).reshape(-1, 2)
 
     #---------------------------------------------------#
-    #   获得所有的分类
+    #   载入模型
     #---------------------------------------------------#
     def generate(self):
         model_path = os.path.expanduser(self.model_path)
         assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.'
         
-        # 计算anchor数量
+        #---------------------------------------------------#
+        #   计算先验框的数量和种类的数量
+        #---------------------------------------------------#
         num_anchors = len(self.anchors)
         num_classes = len(self.class_names)
 
-        # 载入模型，如果原来的模型里已经包括了模型结构则直接载入。
-        # 否则先构建模型再载入
+        #---------------------------------------------------------#
+        #   载入模型，如果原来的模型里已经包括了模型结构则直接载入。
+        #   否则先构建模型再载入
+        #---------------------------------------------------------#
         try:
             self.yolo_model = load_model(model_path, compile=False)
         except:
@@ -103,6 +113,10 @@ class YOLO(object):
 
         self.input_image_shape = K.placeholder(shape=(2, ))
 
+        #---------------------------------------------------------#
+        #   在yolo_eval函数中，我们会对预测结果进行后处理
+        #   后处理的内容包括，解码、非极大抑制、门限筛选等
+        #---------------------------------------------------------#
         boxes, scores, classes = yolo_eval(self.yolo_model.output, self.anchors,
                 num_classes, self.input_image_shape, max_boxes = self.max_boxes,
                 score_threshold = self.score, iou_threshold = self.iou)
@@ -113,30 +127,37 @@ class YOLO(object):
     #---------------------------------------------------#
     def detect_image(self, image):
         start = timer()
-
-        # 调整图片使其符合输入要求
+        #---------------------------------------------------------#
+        #   给图像增加灰条，实现不失真的resize
+        #---------------------------------------------------------#
         new_image_size = (self.model_image_size[1],self.model_image_size[0])
         boxed_image = letterbox_image(image, new_image_size)
         image_data = np.array(boxed_image, dtype='float32')
         image_data /= 255.
-        image_data = np.expand_dims(image_data, 0)  # Add batch dimension.
-
-        # 预测结果
+        #---------------------------------------------------------#
+        #   添加上batch_size维度
+        #---------------------------------------------------------#
+        image_data = np.expand_dims(image_data, 0)
+
+        #---------------------------------------------------------#
+        #   将图像输入网络当中进行预测！
+        #---------------------------------------------------------#
         out_boxes, out_scores, out_classes = self.sess.run(
             [self.boxes, self.scores, self.classes],
             feed_dict={
                 self.yolo_model.input: image_data,
                 self.input_image_shape: [image.size[1], image.size[0]],
-                K.learning_phase(): 0
-            })
+                K.learning_phase(): 0})
 
         print('Found {} boxes for {}'.format(len(out_boxes), 'img'))
-        # 设置字体
+
+        #---------------------------------------------------------#
+        #   设置字体
+        #---------------------------------------------------------#
         font = ImageFont.truetype(font='font/simhei.ttf',
                     size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
-        thickness = (image.size[0] + image.size[1]) // 300
 
-        small_pic=[]
+        thickness = max((image.size[0] + image.size[1]) // 300, 1)
 
         for i, c in list(enumerate(out_classes)):
             predicted_class = self.class_names[c]
@@ -159,7 +180,7 @@ class YOLO(object):
             draw = ImageDraw.Draw(image)
             label_size = draw.textsize(label, font)
             label = label.encode('utf-8')
-            print(label)
+            print(label, top, left, bottom, right)
             
             if top - label_size[1] >= 0:
                 text_origin = np.array([left, top - label_size[1]])
-- 
GitLab