From aea712cc87006dda43bdde156ee647166ea3f03a Mon Sep 17 00:00:00 2001 From: littletomatodonkey Date: Wed, 5 Jan 2022 19:25:36 +0800 Subject: [PATCH] add dist of rec model (#1574) * add distillation loss func and rec distillation --- ppcls/arch/__init__.py | 7 +- .../backbone/legendary_models/mobilenet_v3.py | 8 +- .../GeneralRecognition_PPLCNet_x2_5_dml.yaml | 194 ++++++++++++++++++ .../GeneralRecognition_PPLCNet_x2_5_udml.yaml | 193 +++++++++++++++++ ...mv3_large_x1_0_distill_mv3_small_x1_0.yaml | 8 +- ppcls/engine/engine.py | 7 +- ppcls/engine/evaluation/classification.py | 7 +- ppcls/engine/evaluation/retrieval.py | 2 + ppcls/loss/__init__.py | 2 + ppcls/loss/distillationloss.py | 39 +++- ppcls/loss/dmlloss.py | 24 ++- ppcls/loss/rkdloss.py | 97 +++++++++ 12 files changed, 564 insertions(+), 24 deletions(-) create mode 100644 ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml create mode 100644 ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml create mode 100644 ppcls/loss/rkdloss.py diff --git a/ppcls/arch/__init__.py b/ppcls/arch/__init__.py index f2c2e412..0c45cf6f 100644 --- a/ppcls/arch/__init__.py +++ b/ppcls/arch/__init__.py @@ -77,14 +77,19 @@ class RecModel(TheseusLayer): self.head = None def forward(self, x, label=None): + out = dict() x = self.backbone(x) + out["backbone"] = x if self.neck is not None: x = self.neck(x) + out["features"] = x if self.head is not None: y = self.head(x, label) + out["neck"] = x else: y = None - return {"features": x, "logits": y} + out["logits"] = y + return out class DistillationModel(nn.Layer): diff --git a/ppcls/arch/backbone/legendary_models/mobilenet_v3.py b/ppcls/arch/backbone/legendary_models/mobilenet_v3.py index 1ad42d5c..36661abf 100644 --- a/ppcls/arch/backbone/legendary_models/mobilenet_v3.py +++ b/ppcls/arch/backbone/legendary_models/mobilenet_v3.py @@ -196,7 +196,10 @@ class MobileNetV3(TheseusLayer): bias_attr=False) self.hardswish = nn.Hardswish() - self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) self.fc = Linear(self.class_expand, class_num) @@ -210,7 +213,8 @@ class MobileNetV3(TheseusLayer): x = self.avg_pool(x) x = self.last_conv(x) x = self.hardswish(x) - x = self.dropout(x) + if self.dropout is not None: + x = self.dropout(x) x = self.flatten(x) x = self.fc(x) diff --git a/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml new file mode 100644 index 00000000..c8973b06 --- /dev/null +++ b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml @@ -0,0 +1,194 @@ +# global configs +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + 
freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml new file mode 100644 index 00000000..bcaea03b --- /dev/null +++ b/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml @@ -0,0 +1,193 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "backbone" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml index 3a96e58a..e7147694 100644 --- 
a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml +++ b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml @@ -13,6 +13,7 @@ Global: # used for static mode and model export image_shape: [3, 224, 224] save_inference_dir: "./inference" + use_dali: false # model architecture Arch: @@ -29,9 +30,11 @@ Arch: name: MobileNetV3_large_x1_0 pretrained: True use_ssld: True + dropout_prob: null - Student: name: MobileNetV3_small_x1_0 pretrained: False + dropout_prob: null infer_model_name: "Student" @@ -76,7 +79,6 @@ DataLoader: size: 224 - RandFlipImage: flip_code: 1 - - AutoAugment: - NormalizeImage: scale: 0.00392157 mean: [0.485, 0.456, 0.406] @@ -85,7 +87,7 @@ DataLoader: sampler: name: DistributedBatchSampler - batch_size: 512 + batch_size: 256 drop_last: False shuffle: True loader: @@ -112,7 +114,7 @@ DataLoader: order: '' sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 128 drop_last: False shuffle: False loader: diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py index 21897e3a..cbd70a49 100644 --- a/ppcls/engine/engine.py +++ b/ppcls/engine/engine.py @@ -53,7 +53,8 @@ class Engine(object): self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") - if "Head" in self.config["Arch"]: + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): self.is_rec = True else: self.is_rec = False @@ -357,7 +358,9 @@ class Engine(object): out = self.model(batch_tensor) if isinstance(out, list): out = out[0] - if isinstance(out, dict): + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: out = out["output"] result = self.postprocess_func(out, image_file_list) print(result) diff --git a/ppcls/engine/evaluation/classification.py b/ppcls/engine/evaluation/classification.py index 2a71de9c..98bad639 100644 --- a/ppcls/engine/evaluation/classification.py +++ b/ppcls/engine/evaluation/classification.py @@ -78,10 +78,10 @@ def classification_eval(engine, epoch_id=0): labels = paddle.concat(label_list, 0) if isinstance(out, dict): - if "logits" in out: - out = out["logits"] - elif "Student" in out: + if "Student" in out: out = out["Student"] + elif "logits" in out: + out = out["logits"] else: msg = "Error: Wrong key in out!" 
raise Exception(msg) @@ -106,6 +106,7 @@ def classification_eval(engine, epoch_id=0): metric_dict = engine.eval_metric_func(pred, labels) else: metric_dict = engine.eval_metric_func(out, batch[1]) + for key in metric_dict: if metric_key is None: metric_key = key diff --git a/ppcls/engine/evaluation/retrieval.py b/ppcls/engine/evaluation/retrieval.py index bae77743..8471a42c 100644 --- a/ppcls/engine/evaluation/retrieval.py +++ b/ppcls/engine/evaluation/retrieval.py @@ -123,6 +123,8 @@ def cal_feature(engine, name='gallery'): has_unique_id = True batch[2] = batch[2].reshape([-1, 1]).astype("int64") out = engine.model(batch[0], batch[1]) + if "Student" in out: + out = out["Student"] batch_feas = out["features"] # do norm diff --git a/ppcls/loss/__init__.py b/ppcls/loss/__init__.py index 68739de2..d15dab9d 100644 --- a/ppcls/loss/__init__.py +++ b/ppcls/loss/__init__.py @@ -20,6 +20,8 @@ from .distanceloss import DistanceLoss from .distillationloss import DistillationCELoss from .distillationloss import DistillationGTCELoss from .distillationloss import DistillationDMLLoss +from .distillationloss import DistillationDistanceLoss +from .distillationloss import DistillationRKDLoss from .multilabelloss import MultiLabelLoss from .deephashloss import DSHSDLoss, LCDSHLoss diff --git a/ppcls/loss/distillationloss.py b/ppcls/loss/distillationloss.py index 54dc601b..ab6187f5 100644 --- a/ppcls/loss/distillationloss.py +++ b/ppcls/loss/distillationloss.py @@ -18,6 +18,7 @@ import paddle.nn as nn from .celoss import CELoss from .dmlloss import DMLLoss from .distanceloss import DistanceLoss +from .rkdloss import RKdAngle, RkdDistance class DistillationCELoss(CELoss): @@ -68,7 +69,7 @@ class DistillationGTCELoss(CELoss): def forward(self, predicts, batch): loss_dict = dict() - for idx, name in enumerate(self.model_names): + for _, name in enumerate(self.model_names): out = predicts[name] if self.key is not None: out = out[self.key] @@ -84,7 +85,7 @@ class DistillationDMLLoss(DMLLoss): def __init__(self, model_name_pairs=[], - act=None, + act="softmax", key=None, name="loss_dml"): super().__init__(act=act) @@ -125,7 +126,7 @@ class DistillationDistanceLoss(DistanceLoss): assert isinstance(model_name_pairs, list) self.key = key self.model_name_pairs = model_name_pairs - self.name = name + "_l2" + self.name = name + mode def forward(self, predicts, batch): loss_dict = dict() @@ -139,3 +140,35 @@ class DistillationDistanceLoss(DistanceLoss): for key in loss: loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[key] return loss_dict + + +class DistillationRKDLoss(nn.Layer): + def __init__(self, + target_size=None, + model_name_pairs=(["Student", "Teacher"], ), + student_keepkeys=[], + teacher_keepkeys=[]): + super().__init__() + self.student_keepkeys = student_keepkeys + self.teacher_keepkeys = teacher_keepkeys + self.model_name_pairs = model_name_pairs + assert len(self.student_keepkeys) == len(self.teacher_keepkeys) + + self.rkd_angle_loss = RKdAngle(target_size=target_size) + self.rkd_dist_loss = RkdDistance(target_size=target_size) + + def __call__(self, predicts, batch): + loss_dict = {} + for m1, m2 in self.model_name_pairs: + for idx, ( + student_name, teacher_name + ) in enumerate(zip(self.student_keepkeys, self.teacher_keepkeys)): + student_out = predicts[m1][student_name] + teacher_out = predicts[m2][teacher_name] + + loss_dict[f"loss_angle_{idx}_{m1}_{m2}"] = self.rkd_angle_loss( + student_out, teacher_out) + loss_dict[f"loss_dist_{idx}_{m1}_{m2}"] = self.rkd_dist_loss( + student_out, 
teacher_out) + + return loss_dict diff --git a/ppcls/loss/dmlloss.py b/ppcls/loss/dmlloss.py index d8bb833d..16ea7646 100644 --- a/ppcls/loss/dmlloss.py +++ b/ppcls/loss/dmlloss.py @@ -22,7 +22,7 @@ class DMLLoss(nn.Layer): DMLLoss """ - def __init__(self, act="softmax"): + def __init__(self, act="softmax", eps=1e-12): super().__init__() if act is not None: assert act in ["softmax", "sigmoid"] @@ -32,15 +32,19 @@ class DMLLoss(nn.Layer): self.act = nn.Sigmoid() else: self.act = None + self.eps = eps - def forward(self, out1, out2): - if self.act is not None: - out1 = self.act(out1) - out2 = self.act(out2) + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost - log_out1 = paddle.log(out1) - log_out2 = paddle.log(out2) - loss = (F.kl_div( - log_out1, out2, reduction='batchmean') + F.kl_div( - log_out2, out1, reduction='batchmean')) / 2.0 + def forward(self, x, target): + if self.act is not None: + x = F.softmax(x) + target = F.softmax(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + loss = paddle.mean(loss) return {"DMLLoss": loss} diff --git a/ppcls/loss/rkdloss.py b/ppcls/loss/rkdloss.py new file mode 100644 index 00000000..e6ffea27 --- /dev/null +++ b/ppcls/loss/rkdloss.py @@ -0,0 +1,97 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def pdist(e, squared=False, eps=1e-12): + e_square = e.pow(2).sum(axis=1) + prod = paddle.mm(e, e.t()) + res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clip( + min=eps) + + if not squared: + res = res.sqrt() + return res + + +class RKdAngle(nn.Layer): + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, target_size=None): + super().__init__() + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + # reshape for feature map distillation + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + td = (teacher.unsqueeze(0) - teacher.unsqueeze(1)) + norm_td = F.normalize(td, p=2, axis=2) + t_angle = paddle.bmm(norm_td, norm_td.transpose([0, 2, 1])).reshape( + [-1, 1]) + + sd = (student.unsqueeze(0) - student.unsqueeze(1)) + norm_sd = F.normalize(sd, p=2, axis=2) + s_angle = paddle.bmm(norm_sd, norm_sd.transpose([0, 2, 1])).reshape( + [-1, 1]) + loss = F.smooth_l1_loss(s_angle, t_angle, reduction='mean') + return loss + + +class RkdDistance(nn.Layer): + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, eps=1e-12, target_size=1): + super().__init__() + self.eps = eps + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + t_d = pdist(teacher, squared=False) + mean_td = t_d.mean() + t_d = t_d / (mean_td + self.eps) + + d = pdist(student, squared=False) + mean_d = d.mean() + d = d / (mean_d + self.eps) + + loss = F.smooth_l1_loss(d, t_d, reduction="mean") + return loss -- GitLab
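Note for reviewers: below is a minimal, self-contained sketch (not part of the patch) of what the reworked DMLLoss computes for a student/teacher logits pair — a symmetric KL divergence in which each directed term is scaled by the class count, mirroring ppcls/loss/dmlloss.py above. The tensor shapes and the 512-class setting are illustrative assumptions only.

import paddle
import paddle.nn.functional as F


def dml_loss(x, target, eps=1e-12):
    """Symmetric, class-scaled KL between two logits tensors (illustrative)."""
    # softmax both branches, matching act="softmax" in DMLLoss
    x = F.softmax(x)
    target = F.softmax(target)
    class_num = x.shape[-1]

    def _kldiv(p, q):
        # q * log((q + eps) / (p + eps)), scaled by the number of classes
        return q * paddle.log((q + eps) / (p + eps)) * class_num

    # average the two directed terms, then reduce over batch and classes
    loss = (_kldiv(x, target) + _kldiv(target, x)) / 2
    return paddle.mean(loss)


student_logits = paddle.randn([4, 512])  # hypothetical batch of 4 samples
teacher_logits = paddle.randn([4, 512])
print(dml_loss(student_logits, teacher_logits))

In the new DML/UDML configs this is driven by DistillationDMLLoss over the "logits" key of each RecModel output, while DistillationDistanceLoss (UDML only) compares the new "backbone" outputs; both YAML files are meant to be passed to the existing trainer, presumably via tools/train.py -c <config>.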