diff --git a/ssd/README.md b/ssd/README.md index c5bd68557280f9254c53627daf58291e7077b973..46df96332e0b8886aa6ce0d185d2001df065f817 100644 --- a/ssd/README.md +++ b/ssd/README.md @@ -6,7 +6,7 @@ SSD使用一个卷积神经网络实现“端到端”的检测,所谓“端 1. 将最后的fc6、fc7全连接层变为卷积层,卷积层参数通过对原始fc6、fc7参数采样得到。 2. 将pool5层的参数由2x2-s2(kernel大小为2x2,stride size为2)更改为3x3-s1-p1(kernel大小为3x3,stride size为1,padding size为1)。 -3. 在conv4\_3、conv7、conv8\_2、conv9\_2、conv10\_2及pool11层后面接了priorbox层,priorbox层的主要目的是根据输入的feature map生成一系列的矩形候选框。关于SSD的更详细的介绍可以参考论文\[[1](#引用)\]。 +3. 在conv4\_3、conv7、conv8\_2、conv9\_2、conv10\_2及pool11层后面接了priorbox层,priorbox层的主要目的是根据输入的特征图(feature map)生成一系列的矩形候选框。关于SSD的更详细的介绍可以参考论文\[[1](#引用)\]。 下图为模型(300x300)的总体结构: @@ -17,12 +17,12 @@ SSD使用一个卷积神经网络实现“端到端”的检测,所谓“端 图中每个矩形盒子代表一个卷积层,最后的两个矩形框分别表示汇总各卷积层输出结果和后处理阶段。具体地,在预测阶段网络会输出一组候选矩形框,每个矩形包含两类信息:位置和类别得分,图中倒数第二个矩形框即表示网络的检测结果的汇总处理,由于候选矩形框数量较多且很多矩形框重叠严重,这时需要经过后处理来筛选出质量较高的少数矩形框,这里的后处理主要指非极大值抑制(Non-maximum Suppression)。 -从SSD的网络结构可以看出,候选矩形框在多个feature map上生成,不同的feature map具有的感受野不同,这样可以在不同尺度扫描图像,相对于其他检测方法可以生成更丰富的候选框,从而提高检测精度;另一方面SSD对VGG16的扩展部分以较小的代价实现对候选框的位置和类别得分的计算,整个过程只需要一个卷积神经网络完成,所以速度较快。 +从SSD的网络结构可以看出,候选矩形框在多个特征图(feature map)上生成,不同的feature map具有的感受野不同,这样可以在不同尺度扫描图像,相对于其他检测方法可以生成更丰富的候选框,从而提高检测精度;另一方面SSD对VGG16的扩展部分以较小的代价实现对候选框的位置和类别得分的计算,整个过程只需要一个卷积神经网络完成,所以速度较快。 ## 示例总览 本示例共包含如下文件:
-
表1. 示例文件
+表1. 示例文件 文件 | 用途 ---- | ----- @@ -63,7 +63,7 @@ def prepare_filelist(devkit_dir, years, output_dir): ftest.write(item[0] + ' ' + item[1] + '\n') ``` -该函数首先对每个year的数据进行处理,然后将训练图像的文件路径列表进行随机打乱,最后保存训练文件列表和测试文件列表。默认```prepare_voc_data.py```和```VOCdevkit```在相同目录下,且生成的文件列表也在该目录。需注意```trainval.txt```既包含VOC2007的训练数据,也包含VOC2012的训练数据,```test.txt```只包含VOC2007的测试数据。我们这里提供```trainval.txt```前几行输入作为样例: +该函数首先对每一年(year)的数据进行处理,然后将训练图像的文件路径列表进行随机打乱,最后保存训练文件列表和测试文件列表。默认```prepare_voc_data.py```和```VOCdevkit```在相同目录下,且生成的文件列表也在该目录。需注意```trainval.txt```既包含VOC2007的训练数据,也包含VOC2012的训练数据,```test.txt```只包含VOC2007的测试数据。我们这里提供```trainval.txt```前几行输入作为样例: ``` VOCdevkit/VOC2007/JPEGImages/000005.jpg VOCdevkit/VOC2007/Annotations/000005.xml @@ -99,6 +99,15 @@ train(train_file_list='./data/trainval.txt', 3. 调用```train```执行训练,其中```train_file_list```指定训练数据列表,```dev_file_list```指定评估数据列表,```init_model_path```指定预训练模型位置。 4. 训练过程中会打印一些日志信息,每训练10个batch会输出当前的轮数、当前batch的cost及mAP(mean Average Precision,平均精度均值),每训练一个pass,会保存一次模型,默认保存在```checkpoints```目录下(注:需事先创建)。 +下面给出SSD300x300在VOC数据集(train包括07+12,test为07)上的mAP曲线,迭代140轮mAP可达到71.52%。 + +

+
+ +图2. SSD300x300 mAP收敛曲线 +

+ + ### 模型评估 执行```python eval.py```即可对模型进行评估,```eval.py```的关键执行逻辑如下: @@ -134,7 +143,28 @@ infer( threshold=0.3) ``` -其中```eval_file_list```指定图像路径列表;```save_path```指定预测结果保存路径;```data_args```如上;```batch_size```为每多少样本预测一次;```model_path```指模型的位置;```threshold```为置信度阈值,只有得分大于或等于该值的才会输出。示例还提供了一个可视化脚本,直接运行```python visual.py```即可,须指定输出检测结果路径及输出目录。 +其中```eval_file_list```指定图像路径列表;```save_path```指定预测结果保存路径;```data_args```如上;```batch_size```为每多少样本预测一次;```model_path```指模型的位置;```threshold```为置信度阈值,只有得分大于或等于该值的才会输出。下面给出```infer.res```的一些输出样例: + +``` +VOCdevkit/VOC2007/JPEGImages/006936.jpg 12 0.997844 131.255611777 162.271582842 396.475315094 334.0 +VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.998557 229.160234332 49.5991278887 314.098775387 312.913876176 +VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.372522 187.543615699 133.727034628 345.647156239 327.448492289 +... +``` + +一共包含4个字段,以tab分割,第一个字段是检测图像路径,第二字段为检测矩形框内类别,第三个字段是置信度,第四个字段是4个坐标值(以空格分割)。 + +示例还提供了一个可视化脚本,直接运行```python visual.py```即可,须指定输出检测结果路径及输出目录,默认可视化后图像保存在```./visual_res```,下面是用训练好的模型infer部分图像,并可视化的效果: + +

+ + + +
+ +图3. SSD300x300 检测可视化示例 +

+ ## 自有数据集 在自有数据上训练PaddlePaddle SSD需要完成两个关键准备,首先需要适配网络可以接受的输入格式,这里提供一个推荐的结构,以```train.txt```为例 diff --git a/ssd/data/prepare_voc_data.py b/ssd/data/prepare_voc_data.py index a55b232de1ff50fc70ec65dc1412fcaff7601666..a652956e91ab8277bc6670d4dc85905fc52a3203 100644 --- a/ssd/data/prepare_voc_data.py +++ b/ssd/data/prepare_voc_data.py @@ -7,22 +7,14 @@ devkit_dir = './VOCdevkit' years = ['2007', '2012'] -def get_img_dir(devkit_dir, year): - return osp.join(devkit_dir, 'VOC' + year, 'JPEGImages') - - -def get_annotation_dir(devkit_dir, year): - return osp.join(devkit_dir, 'VOC' + year, 'Annotations') - - -def get_filelist_dir(devkit_dir, year): - return osp.join(devkit_dir, 'VOC' + year, 'ImageSets/Main') +def get_dir(devkit_dir, year, type): + return osp.join(devkit_dir, 'VOC' + year, type) def walk_dir(devkit_dir, year): - filelist_dir = get_filelist_dir(devkit_dir, year) - annotation_dir = get_annotation_dir(devkit_dir, year) - img_dir = get_img_dir(devkit_dir, year) + filelist_dir = get_dir(devkit_dir, year, 'ImageSets/Main') + annotation_dir = get_dir(devkit_dir, year, 'Annotations') + img_dir = get_dir(devkit_dir, year, 'JPEGImages') trainval_list = [] test_list = [] added = set() diff --git a/ssd/data_provider.py b/ssd/data_provider.py index c01eb0819b9b29c4c5f979cf5052b056ecd91002..e59d324b497977ec02c1f728cb49a432f864382c 100644 --- a/ssd/data_provider.py +++ b/ssd/data_provider.py @@ -31,15 +31,8 @@ class Settings(object): self._resize_height = resize_h self._resize_width = resize_w - self._mean_value = mean_value - - img_size = self._resize_height * self._resize_width - self._img_mean = np.zeros(img_size * 3, dtype=np.single) - for idx, value in enumerate(self._mean_value): - self._img_mean[idx * img_size:(idx + 1) * img_size] = value - self._img_mean = self._img_mean.reshape(3, self._resize_height, - self._resize_width) - self._img_mean = self._img_mean.astype('float32') + self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype( + 'float32') 
@property def data_dir(self): @@ -130,12 +123,12 @@ def _reader_creator(settings, file_list, mode, shuffle): image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0)) """ random crop """ - sampled_bbox = image_util.generateBatchSamples( + sampled_bbox = image_util.generate_batch_samples( batch_sampler, bbox_labels, img_width, img_height) if len(sampled_bbox) > 0: idx = int(random.uniform(0, len(sampled_bbox))) - img, sample_labels = image_util.cropImage( + img, sample_labels = image_util.crop_image( img, bbox_labels, sampled_bbox[idx], img_width, img_height) diff --git a/ssd/image_util.py b/ssd/image_util.py index 9997fd5589dd53b67e2d2279f296aa22ce56df83..668d3f838203bf31c1bdab036e2cd190cfb014c9 100644 --- a/ssd/image_util.py +++ b/ssd/image_util.py @@ -40,20 +40,13 @@ class bbox(): self.ymax = ymax -def bboxSize(src_bbox): +def bbox_area(src_bbox): width = src_bbox.xmax - src_bbox.xmin height = src_bbox.ymax - src_bbox.ymin return width * height -def preprocessImg(obj, im): - im = im.astype('float32') - pic = im - pic -= obj.img_mean - return pic.flatten() - - -def generateSample(sampler): +def generate_sample(sampler): scale = random.uniform(sampler.min_scale, sampler.max_scale) min_aspect_ratio = max(sampler.min_aspect_ratio, (scale**2.0)) max_aspect_ratio = min(sampler.max_aspect_ratio, 1 / (scale**2.0)) @@ -70,7 +63,7 @@ def generateSample(sampler): return sampled_bbox -def jaccardOverlap(sample_bbox, object_bbox): +def jaccard_overlap(sample_bbox, object_bbox): if sample_bbox.xmin >= object_bbox.xmax or \ sample_bbox.xmax <= object_bbox.xmin or \ sample_bbox.ymin >= object_bbox.ymax or \ @@ -82,20 +75,20 @@ def jaccardOverlap(sample_bbox, object_bbox): intersect_ymax = min(sample_bbox.ymax, object_bbox.ymax) intersect_size = (intersect_xmax - intersect_xmin) * ( intersect_ymax - intersect_ymin) - sample_bbox_size = bboxSize(sample_bbox) - object_bbox_size = bboxSize(object_bbox) + sample_bbox_size = bbox_area(sample_bbox) + object_bbox_size = 
bbox_area(object_bbox) overlap = intersect_size / ( sample_bbox_size + object_bbox_size - intersect_size) return overlap -def satisfySampleConstraint(sampler, sample_bbox, bbox_labels): +def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): if sampler.min_jaccard_overlap == 0 and sampler.max_jaccard_overlap == 0: return True for i in range(len(bbox_labels)): object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], bbox_labels[i][3], bbox_labels[i][4]) - overlap = jaccardOverlap(sample_bbox, object_bbox) + overlap = jaccard_overlap(sample_bbox, object_bbox) if sampler.min_jaccard_overlap != 0 and \ overlap < sampler.min_jaccard_overlap: continue @@ -106,7 +99,8 @@ def satisfySampleConstraint(sampler, sample_bbox, bbox_labels): return False -def generateBatchSamples(batch_sampler, bbox_labels, image_width, image_height): +def generate_batch_samples(batch_sampler, bbox_labels, image_width, + image_height): sampled_bbox = [] index = [] c = 0 @@ -115,8 +109,8 @@ def generateBatchSamples(batch_sampler, bbox_labels, image_width, image_height): for i in range(sampler.max_trial): if found >= sampler.max_sample: break - sample_bbox = generateSample(sampler) - if satisfySampleConstraint(sampler, sample_bbox, bbox_labels): + sample_bbox = generate_sample(sampler) + if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): sampled_bbox.append(sample_bbox) found = found + 1 index.append(c) @@ -124,7 +118,7 @@ def generateBatchSamples(batch_sampler, bbox_labels, image_width, image_height): return sampled_bbox -def clipBBox(src_bbox): +def clip_bbox(src_bbox): src_bbox.xmin = max(min(src_bbox.xmin, 1.0), 0.0) src_bbox.ymin = max(min(src_bbox.ymin, 1.0), 0.0) src_bbox.xmax = max(min(src_bbox.xmax, 1.0), 0.0) @@ -132,7 +126,7 @@ def clipBBox(src_bbox): return src_bbox -def meetEmitConstraint(src_bbox, sample_bbox): +def meet_emit_constraint(src_bbox, sample_bbox): center_x = (src_bbox.xmax + src_bbox.xmin) / 2 center_y = (src_bbox.ymax + src_bbox.ymin) / 2 if 
center_x >= sample_bbox.xmin and \ @@ -143,14 +137,14 @@ def meetEmitConstraint(src_bbox, sample_bbox): return False -def transformLabels(bbox_labels, sample_bbox): +def transform_labels(bbox_labels, sample_bbox): proj_bbox = bbox(0, 0, 0, 0) sample_labels = [] for i in range(len(bbox_labels)): sample_label = [] object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2], bbox_labels[i][3], bbox_labels[i][4]) - if not meetEmitConstraint(object_bbox, sample_bbox): + if not meet_emit_constraint(object_bbox, sample_bbox): continue sample_width = sample_bbox.xmax - sample_bbox.xmin sample_height = sample_bbox.ymax - sample_bbox.ymin @@ -158,8 +152,8 @@ def transformLabels(bbox_labels, sample_bbox): proj_bbox.ymin = (object_bbox.ymin - sample_bbox.ymin) / sample_height proj_bbox.xmax = (object_bbox.xmax - sample_bbox.xmin) / sample_width proj_bbox.ymax = (object_bbox.ymax - sample_bbox.ymin) / sample_height - proj_bbox = clipBBox(proj_bbox) - if bboxSize(proj_bbox) > 0: + proj_bbox = clip_bbox(proj_bbox) + if bbox_area(proj_bbox) > 0: sample_label.append(bbox_labels[i][0]) sample_label.append(float(proj_bbox.xmin)) sample_label.append(float(proj_bbox.ymin)) @@ -170,12 +164,12 @@ def transformLabels(bbox_labels, sample_bbox): return sample_labels -def cropImage(img, bbox_labels, sample_bbox, image_width, image_height): - sample_bbox = clipBBox(sample_bbox) +def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): + sample_bbox = clip_bbox(sample_bbox) xmin = int(sample_bbox.xmin * image_width) xmax = int(sample_bbox.xmax * image_width) ymin = int(sample_bbox.ymin * image_height) ymax = int(sample_bbox.ymax * image_height) sample_img = img[ymin:ymax, xmin:xmax] - sample_labels = transformLabels(bbox_labels, sample_bbox) + sample_labels = transform_labels(bbox_labels, sample_bbox) return sample_img, sample_labels diff --git a/ssd/images/SSD300x300_map.png b/ssd/images/SSD300x300_map.png new file mode 100644 index 
0000000000000000000000000000000000000000..a40a1e028be7ba979052034c152028976bc4b715 Binary files /dev/null and b/ssd/images/SSD300x300_map.png differ diff --git a/ssd/images/vis_1.jpg b/ssd/images/vis_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c317462ee6053df15fa8d44d0f35398e47156e8d Binary files /dev/null and b/ssd/images/vis_1.jpg differ diff --git a/ssd/images/vis_2.jpg b/ssd/images/vis_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7bc59b239cb9c123087fdecbb210ad52a3a35f10 Binary files /dev/null and b/ssd/images/vis_2.jpg differ diff --git a/ssd/images/vis_3.jpg b/ssd/images/vis_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a79598343a7e2707ba79c2e8891d7af0c24df491 Binary files /dev/null and b/ssd/images/vis_3.jpg differ diff --git a/ssd/images/vis_4.jpg b/ssd/images/vis_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..96b2c99c9ef986cc0d4802b31c33f076fce6f965 Binary files /dev/null and b/ssd/images/vis_4.jpg differ diff --git a/ssd/index.html b/ssd/index.html index 3251d4f22414764559a804d6d7b1af62c132bad9..325ff0c4b7783043f6ea43f1078a9e8e784497c0 100644 --- a/ssd/index.html +++ b/ssd/index.html @@ -48,7 +48,7 @@ SSD使用一个卷积神经网络实现“端到端”的检测,所谓“端 1. 将最后的fc6、fc7全连接层变为卷积层,卷积层参数通过对原始fc6、fc7参数采样得到。 2. 将pool5层的参数由2x2-s2(kernel大小为2x2,stride size为2)更改为3x3-s1-p1(kernel大小为3x3,stride size为1,padding size为1)。 -3. 在conv4\_3、conv7、conv8\_2、conv9\_2、conv10\_2及pool11层后面接了priorbox层,priorbox层的主要目的是根据输入的feature map生成一系列的矩形候选框。关于SSD的更详细的介绍可以参考论文\[[1](#引用)\]。 +3. 
在conv4\_3、conv7、conv8\_2、conv9\_2、conv10\_2及pool11层后面接了priorbox层,priorbox层的主要目的是根据输入的特征图(feature map)生成一系列的矩形候选框。关于SSD的更详细的介绍可以参考论文\[[1](#引用)\]。 下图为模型(300x300)的总体结构: @@ -59,12 +59,12 @@ SSD使用一个卷积神经网络实现“端到端”的检测,所谓“端 图中每个矩形盒子代表一个卷积层,最后的两个矩形框分别表示汇总各卷积层输出结果和后处理阶段。具体地,在预测阶段网络会输出一组候选矩形框,每个矩形包含两类信息:位置和类别得分,图中倒数第二个矩形框即表示网络的检测结果的汇总处理,由于候选矩形框数量较多且很多矩形框重叠严重,这时需要经过后处理来筛选出质量较高的少数矩形框,这里的后处理主要指非极大值抑制(Non-maximum Suppression)。 -从SSD的网络结构可以看出,候选矩形框在多个feature map上生成,不同的feature map具有的感受野不同,这样可以在不同尺度扫描图像,相对于其他检测方法可以生成更丰富的候选框,从而提高检测精度;另一方面SSD对VGG16的扩展部分以较小的代价实现对候选框的位置和类别得分的计算,整个过程只需要一个卷积神经网络完成,所以速度较快。 +从SSD的网络结构可以看出,候选矩形框在多个特征图(feature map)上生成,不同的feature map具有的感受野不同,这样可以在不同尺度扫描图像,相对于其他检测方法可以生成更丰富的候选框,从而提高检测精度;另一方面SSD对VGG16的扩展部分以较小的代价实现对候选框的位置和类别得分的计算,整个过程只需要一个卷积神经网络完成,所以速度较快。 ## 示例总览 本示例共包含如下文件:
-
表1. 示例文件
+表1. 示例文件 文件 | 用途 ---- | ----- @@ -105,7 +105,7 @@ def prepare_filelist(devkit_dir, years, output_dir): ftest.write(item[0] + ' ' + item[1] + '\n') ``` -该函数首先对每个year的数据进行处理,然后将训练图像的文件路径列表进行随机打乱,最后保存训练文件列表和测试文件列表。默认```prepare_voc_data.py```和```VOCdevkit```在相同目录下,且生成的文件列表也在该目录。需注意```trainval.txt```既包含VOC2007的训练数据,也包含VOC2012的训练数据,```test.txt```只包含VOC2007的测试数据。我们这里提供```trainval.txt```前几行输入作为样例: +该函数首先对每一年(year)的数据进行处理,然后将训练图像的文件路径列表进行随机打乱,最后保存训练文件列表和测试文件列表。默认```prepare_voc_data.py```和```VOCdevkit```在相同目录下,且生成的文件列表也在该目录。需注意```trainval.txt```既包含VOC2007的训练数据,也包含VOC2012的训练数据,```test.txt```只包含VOC2007的测试数据。我们这里提供```trainval.txt```前几行输入作为样例: ``` VOCdevkit/VOC2007/JPEGImages/000005.jpg VOCdevkit/VOC2007/Annotations/000005.xml @@ -141,6 +141,15 @@ train(train_file_list='./data/trainval.txt', 3. 调用```train```执行训练,其中```train_file_list```指定训练数据列表,```dev_file_list```指定评估数据列表,```init_model_path```指定预训练模型位置。 4. 训练过程中会打印一些日志信息,每训练10个batch会输出当前的轮数、当前batch的cost及mAP(mean Average Precision,平均精度均值),每训练一个pass,会保存一次模型,默认保存在```checkpoints```目录下(注:需事先创建)。 +下面给出SSD300x300在VOC数据集(train包括07+12,test为07)上的mAP曲线,迭代140轮mAP可达到71.52%。 + +

+
+ +图2. SSD300x300 mAP收敛曲线 +

+ + ### 模型评估 执行```python eval.py```即可对模型进行评估,```eval.py```的关键执行逻辑如下: @@ -176,7 +185,28 @@ infer( threshold=0.3) ``` -其中```eval_file_list```指定图像路径列表;```save_path```指定预测结果保存路径;```data_args```如上;```batch_size```为每多少样本预测一次;```model_path```指模型的位置;```threshold```为置信度阈值,只有得分大于或等于该值的才会输出。示例还提供了一个可视化脚本,直接运行```python visual.py```即可,须指定输出检测结果路径及输出目录。 +其中```eval_file_list```指定图像路径列表;```save_path```指定预测结果保存路径;```data_args```如上;```batch_size```为每多少样本预测一次;```model_path```指模型的位置;```threshold```为置信度阈值,只有得分大于或等于该值的才会输出。下面给出```infer.res```的一些输出样例: + +``` +VOCdevkit/VOC2007/JPEGImages/006936.jpg 12 0.997844 131.255611777 162.271582842 396.475315094 334.0 +VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.998557 229.160234332 49.5991278887 314.098775387 312.913876176 +VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.372522 187.543615699 133.727034628 345.647156239 327.448492289 +... +``` + +一共包含4个字段,以tab分割,第一个字段是检测图像路径,第二字段为检测矩形框内类别,第三个字段是置信度,第四个字段是4个坐标值(以空格分割)。 + +示例还提供了一个可视化脚本,直接运行```python visual.py```即可,须指定输出检测结果路径及输出目录,默认可视化后图像保存在```./visual_res```,下面是用训练好的模型infer部分图像,并可视化的效果: + +

+ + + +
+ +图3. SSD300x300 检测可视化示例 +

+ ## 自有数据集 在自有数据上训练PaddlePaddle SSD需要完成两个关键准备,首先需要适配网络可以接受的输入格式,这里提供一个推荐的结构,以```train.txt```为例 diff --git a/ssd/infer.py b/ssd/infer.py index 6fdf8fb8a35279087aeeec9ab8b525a4304a09de..c0bc79189935d8bdd59f17756b9c95581870f36a 100644 --- a/ssd/infer.py +++ b/ssd/infer.py @@ -21,11 +21,26 @@ def _infer(inferer, infer_data, threshold): return ret +def save_batch_res(ret_res, img_w, img_h, fname_list, fout): + for det_res in ret_res: + img_idx = int(det_res[0]) + label = int(det_res[1]) + conf_score = det_res[2] + xmin = det_res[3] * img_w[img_idx] + ymin = det_res[4] * img_h[img_idx] + xmax = det_res[5] * img_w[img_idx] + ymax = det_res[6] * img_h[img_idx] + fout.write(fname_list[img_idx] + '\t' + str(label) + '\t' + str( + conf_score) + '\t' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + + ' ' + str(ymax)) + fout.write('\n') + + def infer(eval_file_list, save_path, data_args, batch_size, model_path, threshold): - detect_out = vgg_net_ssd_v2.net_conf(mode='infer') + detect_out = vgg_ssd_net.net_conf(mode='infer') - assert os.path.isfile(init_model_path), 'Invalid model.' + assert os.path.isfile(model_path), 'Invalid model.' 
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path)) inferer = paddle.inference.Inference( @@ -46,24 +61,12 @@ def infer(eval_file_list, save_path, data_args, batch_size, model_path, for img in reader(): test_data.append([img]) fname_list.append(all_fname_list[idx]) - w, h = \ - Image.open(os.path.join('./data', fname_list[-1])).size + w, h = Image.open(os.path.join('./data', fname_list[-1])).size img_w.append(w) img_h.append(h) if len(test_data) == batch_size: ret_res = _infer(inferer, test_data, threshold) - for det_res in ret_res: - img_idx = int(det_res[0]) - label = int(det_res[1]) - conf_score = det_res[2] - xmin = det_res[3] * img_w[img_idx] - ymin = det_res[4] * img_h[img_idx] - xmax = det_res[5] * img_w[img_idx] - ymax = det_res[6] * img_h[img_idx] - fout.write(fname_list[img_idx] + '\t' + str(label) + '\t' + - str(conf_score) + '\t' + str(xmin) + ' ' + str( - ymin) + ' ' + str(xmax) + ' ' + str( - ymax) + '\n') + save_batch_res(ret_res, img_w, img_h, fname_list, fout) test_data = [] fname_list = [] img_w = [] @@ -73,17 +76,7 @@ def infer(eval_file_list, save_path, data_args, batch_size, model_path, if len(test_data) > 0: ret_res = _infer(inferer, test_data, threshold) - for det_res in ret_res: - img_idx = int(det_res[0]) - label = int(det_res[1]) - conf_score = det_res[2] - xmin = det_res[3] * img_w[img_idx] - ymin = det_res[4] * img_h[img_idx] - xmax = det_res[5] * img_w[img_idx] - ymax = det_res[6] * img_h[img_idx] - fout.write(fname_list[img_idx] + '\t' + str(label) + '\t' + str( - conf_score) + '\t' + str(xmin) + ' ' + str(ymin) + ' ' + - str(xmax) + ' ' + str(ymax) + '\n') + save_batch_res(ret_res, img_w, img_h, fname_list, fout) if __name__ == "__main__": diff --git a/ssd/train.py b/ssd/train.py index 4a5de0399fd1e5eed4caf8b35e3744e5e979cdad..783944214b67d15af31267c8ba1ded3fa48e6cb0 100644 --- a/ssd/train.py +++ b/ssd/train.py @@ -8,17 +8,6 @@ from config.pascal_voc_conf import cfg def train(train_file_list, dev_file_list, 
data_args, init_model_path): - cost, detect_out = vgg_ssd_net.net_conf('train') - - parameters = paddle.parameters.create(cost) - - if not (init_model_path is None): - assert os.path.isfile(init_model_path), 'Invalid model.' - fparams = paddle.parameters.Parameters.from_tar( - gzip.open(init_model_path)) - for param_name in fparams.names(): - parameters.set(param_name, fparams.get(param_name)) - optimizer = paddle.optimizer.Momentum( momentum=cfg.TRAIN.MOMENTUM, learning_rate=cfg.TRAIN.LEARNING_RATE, @@ -28,6 +17,13 @@ def train(train_file_list, dev_file_list, data_args, init_model_path): learning_rate_decay_b=cfg.TRAIN.LEARNING_RATE_DECAY_B, learning_rate_schedule=cfg.TRAIN.LEARNING_RATE_SCHEDULE) + cost, detect_out = vgg_ssd_net.net_conf('train') + + parameters = paddle.parameters.create(cost) + if not (init_model_path is None): + assert os.path.isfile(init_model_path), 'Invalid model.' + parameters.init_from_tar(gzip.open(init_model_path)) + trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, @@ -37,8 +33,7 @@ def train(train_file_list, dev_file_list, data_args, init_model_path): feeding = {'image': 0, 'bbox': 1} train_reader = paddle.batch( - paddle.reader.shuffle( - data_provider.train(data_args, train_file_list), buf_size=200), + data_provider.train(data_args, train_file_list), batch_size=cfg.TRAIN.BATCH_SIZE) # generate a batch image each time dev_reader = paddle.batch( diff --git a/ssd/vgg_ssd_net.py b/ssd/vgg_ssd_net.py index cc7be564ce5997baa86ab4673b86aae06cc80d8d..4a5c107e6fda6e58ff2b27c55bd4773639d36aab 100644 --- a/ssd/vgg_ssd_net.py +++ b/ssd/vgg_ssd_net.py @@ -9,67 +9,56 @@ def net_conf(mode): """ default_l2regularization = cfg.TRAIN.L2REGULARIZATION - default_bias_attr = paddle.attr.ParamAttr( - l2_rate=0.0, learning_rate=2.0, momentum=cfg.TRAIN.MOMENTUM) + default_bias_attr = paddle.attr.ParamAttr(l2_rate=0.0, learning_rate=2.0) default_static_bias_attr = paddle.attr.ParamAttr(is_static=True) - def xavier(channels, filter_size, 
local_lr, regularization): - init_w = (3.0 / (filter_size**2 * channels))**0.5 + def get_param_attr(local_lr, regularization): is_static = False if local_lr == 0.0: is_static = True return paddle.attr.ParamAttr( - initial_min=(0.0 - init_w), - initial_max=init_w, - learning_rate=local_lr, - l2_rate=regularization, - momentum=cfg.TRAIN.MOMENTUM, - is_static=is_static) + learning_rate=local_lr, l2_rate=regularization, is_static=is_static) + + def conv_group(stack_num, name_list, input, filter_size_list, num_channels, + num_filters_list, stride_list, padding_list, + common_bias_attr, common_param_attr, common_act): + conv = input + in_channels = num_channels + for i in xrange(stack_num): + conv = paddle.layer.img_conv( + name=name_list[i], + input=conv, + filter_size=filter_size_list[i], + num_channels=in_channels, + num_filters=num_filters_list[i], + stride=stride_list[i], + padding=padding_list[i], + bias_attr=common_bias_attr, + param_attr=common_param_attr, + act=common_act) + in_channels = num_filters_list[i] + return conv def vgg_block(idx_str, input, num_channels, num_filters, pool_size, pool_stride, pool_pad): layer_name = "conv%s_" % idx_str - conv1 = paddle.layer.img_conv( - name=layer_name + "1", - input=input, - filter_size=3, - num_channels=num_channels, - num_filters=num_filters, - stride=1, - padding=1, - bias_attr=default_bias_attr, - param_attr=xavier(num_filters, 3, 1, default_l2regularization), - act=paddle.activation.Relu()) - conv2 = paddle.layer.img_conv( - name=layer_name + "2", - input=conv1, - filter_size=3, - num_channels=num_filters, - num_filters=num_filters, - stride=1, - padding=1, - bias_attr=default_bias_attr, - param_attr=xavier(num_filters, 3, 1, default_l2regularization), - act=paddle.activation.Relu()) - conv3 = paddle.layer.img_conv( - name=layer_name + "3", - input=conv2, - filter_size=3, - num_channels=num_filters, - num_filters=num_filters, - stride=1, - padding=1, - bias_attr=default_bias_attr, - param_attr=xavier(num_filters, 
3, 1, default_l2regularization), - act=paddle.activation.Relu()) + stack_num = 3 + name_list = [layer_name + str(i + 1) for i in xrange(3)] + + conv = conv_group(stack_num, name_list, input, [3] * stack_num, + num_channels, [num_filters] * stack_num, + [1] * stack_num, [1] * stack_num, default_bias_attr, + get_param_attr(1, default_l2regularization), + paddle.activation.Relu()) + pool = paddle.layer.img_pool( - input=conv3, + input=conv, pool_size=pool_size, num_channels=num_filters, pool_type=paddle.pooling.CudnnMax(), stride=pool_stride, padding=pool_pad) - return conv3, pool + return conv, pool def mbox_block(layer_idx, input, num_channels, filter_size, loc_filters, conf_filters): @@ -83,8 +72,7 @@ def net_conf(mode): stride=1, padding=1, bias_attr=default_bias_attr, - param_attr=xavier(loc_filters, filter_size, 1, - default_l2regularization), + param_attr=get_param_attr(1, default_l2regularization), act=paddle.activation.Identity()) mbox_conf_name = layer_idx + "_mbox_conf" @@ -97,8 +85,7 @@ def net_conf(mode): stride=1, padding=1, bias_attr=default_bias_attr, - param_attr=xavier(conf_filters, filter_size, 1, - default_l2regularization), + param_attr=get_param_attr(1, default_l2regularization), act=paddle.activation.Identity()) return mbox_loc, mbox_conf @@ -106,30 +93,14 @@ def net_conf(mode): def ssd_block(layer_idx, input, img_shape, num_channels, num_filters1, num_filters2, aspect_ratio, variance, min_size, max_size): layer_name = "conv" + layer_idx + "_" + stack_num = 2 conv1_name = layer_name + "1" - conv1 = paddle.layer.img_conv( - name=conv1_name, - input=input, - filter_size=1, - num_channels=num_channels, - num_filters=num_filters1, - stride=1, - padding=0, - bias_attr=default_bias_attr, - param_attr=xavier(num_filters1, 1, 1, default_l2regularization), - act=paddle.activation.Relu()) conv2_name = layer_name + "2" - conv2 = paddle.layer.img_conv( - name=conv2_name, - input=conv1, - filter_size=3, - num_channels=num_filters1, - 
num_filters=num_filters2, - stride=2, - padding=1, - bias_attr=default_bias_attr, - param_attr=xavier(num_filters2, 3, 1, default_l2regularization), - act=paddle.activation.Relu()) + conv2 = conv_group(stack_num, [conv1_name, conv2_name], input, [1, 3], + num_channels, [num_filters1, num_filters2], [1, 2], + [0, 1], default_bias_attr, + get_param_attr(1, default_l2regularization), + paddle.activation.Relu()) loc_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4 conf_filters = ( @@ -153,28 +124,12 @@ def net_conf(mode): height=cfg.IMG_HEIGHT, width=cfg.IMG_WIDTH) - conv1_1 = paddle.layer.img_conv( - name="conv1_1", - input=img, - filter_size=3, - num_channels=3, - num_filters=64, - stride=1, - padding=1, - bias_attr=default_static_bias_attr, - param_attr=xavier(64, 3, 0, 0), - act=paddle.activation.Relu()) - conv1_2 = paddle.layer.img_conv( - name="conv1_2", - input=conv1_1, - filter_size=3, - num_channels=64, - num_filters=64, - stride=1, - padding=1, - bias_attr=default_static_bias_attr, - param_attr=xavier(64, 3, 0, 0), - act=paddle.activation.Relu()) + stack_num = 2 + conv1_2 = conv_group(stack_num, ['conv1_1', 'conv1_2'], img, + [3] * stack_num, 3, [64] * stack_num, [1] * stack_num, + [1] * stack_num, default_static_bias_attr, + get_param_attr(0, 0), paddle.activation.Relu()) + pool1 = paddle.layer.img_pool( name="pool1", input=conv1_2, @@ -183,28 +138,12 @@ def net_conf(mode): num_channels=64, stride=2) - conv2_1 = paddle.layer.img_conv( - name="conv2_1", - input=pool1, - filter_size=3, - num_channels=64, - num_filters=128, - stride=1, - padding=1, - bias_attr=default_static_bias_attr, - param_attr=xavier(128, 3, 0, 0), - act=paddle.activation.Relu()) - conv2_2 = paddle.layer.img_conv( - name="conv2_2", - input=conv2_1, - filter_size=3, - num_channels=128, - num_filters=128, - stride=1, - padding=1, - bias_attr=default_static_bias_attr, - param_attr=xavier(128, 3, 0, 0), - act=paddle.activation.Relu()) + stack_num = 2 + conv2_2 = conv_group(stack_num, 
['conv2_1', 'conv2_2'], pool1, [3] * + stack_num, 64, [128] * stack_num, [1] * stack_num, + [1] * stack_num, default_static_bias_attr, + get_param_attr(0, 0), paddle.activation.Relu()) + pool2 = paddle.layer.img_pool( name="pool2", input=conv2_2, @@ -226,39 +165,18 @@ def net_conf(mode): name="conv4_3_norm", input=conv4_3, param_attr=paddle.attr.ParamAttr( - initial_mean=20, - initial_std=0, - is_static=False, - learning_rate=1, - momentum=cfg.TRAIN.MOMENTUM)) + initial_mean=20, initial_std=0, is_static=False, learning_rate=1)) conv4_3_norm_mbox_loc, conv4_3_norm_mbox_conf = \ mbox_block("conv4_3_norm", conv4_3_norm, 512, 3, 12, 63) conv5_3, pool5 = vgg_block("5", pool4, 512, 512, 3, 1, 1) - fc6 = paddle.layer.img_conv( - name="fc6", - input=pool5, - filter_size=3, - num_channels=512, - num_filters=1024, - stride=1, - padding=1, - bias_attr=default_bias_attr, - param_attr=xavier(1024, 3, 1, default_l2regularization), - act=paddle.activation.Relu()) + stack_num = 2 + fc7 = conv_group(stack_num, ['fc6', 'fc7'], pool5, [3, 1], 512, [1024] * + stack_num, [1] * stack_num, [1, 0], default_bias_attr, + get_param_attr(1, default_l2regularization), + paddle.activation.Relu()) - fc7 = paddle.layer.img_conv( - name="fc7", - input=fc6, - filter_size=1, - num_channels=1024, - num_filters=1024, - stride=1, - padding=0, - bias_attr=default_bias_attr, - param_attr=xavier(1024, 1, 1, default_l2regularization), - act=paddle.activation.Relu()) fc7_mbox_loc, fc7_mbox_conf = mbox_block("fc7", fc7, 1024, 3, 24, 126) fc7_mbox_priorbox = paddle.layer.priorbox( input=fc7,