Commit d79fb66e authored by tianyi1997, committed by HydrogenSulfate

Create train func & cfg for MetaBIN

Parent 0cc6bc0b
# global configs
Global:
checkpoints: null
pretrained_model: null #"metabin_resnet50_final"
output_dir: "./output/"
device: "gpu"
iter_per_epoch: 145
warmup_iter: 10
save_interval: 40
eval_during_train: True
eval_interval: 10
epochs: &epochs 120
print_batch_step: 20
use_visualdl: False
eval_mode: "retrieval"
retrieval_feature_from: "features" # 'backbone' or 'features'
re_ranking: False
# used for static mode and model export
image_shape: [3, 256, 128]
save_inference_dir: "./inference"
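# 'metabin' routes training to the MetaBIN-specific epoch loop (train_epoch_metabin)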
train_mode: 'metabin'
# model architecture
Arch:
name: "RecModel"
Backbone:
name: "ResNet50_metabin"
pretrained: False # "metabin_resnet50_backbone_pretrained"
bias_lr_factor: 2.0
gate_lr_factor: 20.0
BackboneStopLayer:
name: "flatten"
Neck:
name: BNNeck
num_features: &feat_dim 2048
weight_attr:
initializer:
name: Constant
value: 1.0
Head:
name: "FC"
embedding_size: *feat_dim
class_num: &class_num 751
weight_attr:
initializer:
name: KaimingUniform
negative_slope: 2.23606 # math.sqrt(5)
nonlinearity: "leaky_relu"
bias_attr: False
# data loader for train and eval
DataLoader:
Train:
dataset:
name: "Market1501"
image_root: "./dataset/"
cls_label_path: "bounding_box_train"
backend: "pil"
transform_ops:
- ResizeImage:
size: [128, 256]
return_numpy: False
interpolation: "bicubic"
backend: "pil"
- RandFlipImage:
flip_code: 1
- Pad:
padding: 10
- RandCropImageV2:
size: [128, 256]
- ColorJitter:
brightness: 0.15
contrast: 0.15
saturation: 0.1
hue: 0.1
- ToTensor:
- Normalize:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
sampler:
name: NaiveIdentityBatchSampler
batch_size: 96
num_instances: 4
drop_last: True
loader:
num_workers: 4
use_shared_memory: True
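# auxiliary loader for the meta-learning steps: DomainShuffleBatchSampler with
# camera_to_domain treats each camera as a separate domain; train_epoch_metabin
# builds this loader lazily on the first iteration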
Metalearning:
Train:
dataset:
name: "Market1501"
image_root: "./dataset/"
cls_label_path: "bounding_box_train"
backend: "pil"
transform_ops:
- ResizeImage:
size: [128, 256]
return_numpy: False
interpolation: "bicubic"
backend: "pil"
- RandFlipImage:
flip_code: 1
- Pad:
padding: 10
- RandCropImageV2:
size: [128, 256]
- ColorJitter:
brightness: 0.15
contrast: 0.15
saturation: 0.1
hue: 0.1
- ToTensor:
- Normalize:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
sampler:
name: DomainShuffleBatchSampler
batch_size: 96
num_instances: 4
drop_last: True
camera_to_domain: True
loader:
num_workers: 4
use_shared_memory: True
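# cross-domain evaluation: trained on Market1501, evaluated on DukeMTMC query/gallery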
Eval:
Query:
dataset:
name: "DukeMTMC"
image_root: "./dataset/"
cls_label_path: "query"
backend: "pil"
transform_ops:
- ResizeImage:
size: [128, 256]
return_numpy: False
interpolation: "bicubic"
backend: "pil"
- ToTensor:
- Normalize:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
sampler:
name: DistributedBatchSampler
batch_size: 128
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
Gallery:
dataset:
name: "DukeMTMC"
image_root: "./dataset/"
cls_label_path: "bounding_box_test"
backend: "pil"
transform_ops:
- ResizeImage:
size: [128, 256]
return_numpy: False
interpolation: "bicubic"
backend: "pil"
- ToTensor:
- Normalize:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
sampler:
name: DistributedBatchSampler
batch_size: 128
drop_last: False
shuffle: False
loader:
num_workers: 4
use_shared_memory: True
# loss function config for training/eval process
Loss:
Train:
- CELossForMetaBIN:
weight: 1.0
epsilon: 0.1
- TripletLossForMetaBIN:
weight: 1.0
margin: 0.3
feature_from: "backbone"
- IntraDomainScatterLoss:
weight: 1.0
normalize_feature: True
feature_from: "backbone"
- InterDomainShuffleLoss:
weight: 1.0
normalize_feature: False
feature_from: "backbone"
Basic:
- CELossForMetaBIN:
weight: 1.0
epsilon: 0.1
- TripletLossForMetaBIN:
weight: 1.0
margin: 0.3
feature_from: "backbone"
MetaTrain:
- CELossForMetaBIN:
weight: 1.0
epsilon: 0.1
- TripletLossForMetaBIN:
weight: 1.0
margin: 0.3
feature_from: "backbone"
- IntraDomainScatterLoss:
weight: 1.0
normalize_feature: True
feature_from: "backbone"
- InterDomainShuffleLoss:
weight: 1.0
normalize_feature: False
feature_from: "backbone"
MetaTest:
- CELossForMetaBIN:
weight: 1.0
epsilon: 0.1
- TripletLossForMetaBIN:
weight: 1.0
margin: 0.3
feature_from: "backbone"
Eval:
- TripletLossForMetaBIN:
weight: 1.0
margin: 0.3
feature_from: "backbone"
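# three parameter groups: (1) conv/norm/fc weights with MultiStepDecay, (2) BIN gate
# weights with a constant LR, and (3) an SGD entry whose Cyclic LR is only read to
# scale the inner gate update during the meta-test step (see setup_opt)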
Optimizer:
- Momentum:
scope: ".*(conv|batch_norm|instance_norm|feat_bn|fc)"
lr:
name: MultiStepDecay
epochs: *epochs
learning_rate: 0.01
step_each_epoch: 145
milestones: [50, 90]
gamma: 0.1
warmup_epoch: 10
warmup_start_lr: 0.0001
by_epoch: False
last_epoch: -1
momentum: 0.9
regularizer:
name: "L2"
coeff: 0.0005
- Momentum:
scope: "backbone.*gate"
lr:
name: Constant
learning_rate: 0.01
last_epoch: 0
momentum: 0.9
- SGD:
scope: "RecModel"
lr:
name: Cyclic
epochs: *epochs
step_each_epoch: 145
base_learning_rate: 0.001
max_learning_rate: 0.1
warmup_epoch: 0
warmup_start_lr: 1
step_size_up: 1095
step_size_down: 1095
by_epoch: False
last_epoch: 0
AMP:
scale_loss: 65536
use_dynamic_loss_scaling: True
Metric:
Eval:
- Recallk:
topk: [1, 5, 10]
- mAP: {}
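# Note: this commit does not state where the YAML is saved; assuming a path such as
# ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml (illustrative), training would
# typically be launched with PaddleClas's entry point:
#   python3 tools/train.py -c ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml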
@@ -15,3 +15,4 @@ from ppcls.engine.train.train import train_epoch
from ppcls.engine.train.train_fixmatch import train_epoch_fixmatch
from ppcls.engine.train.train_fixmatch_ccssl import train_epoch_fixmatch_ccssl
from ppcls.engine.train.train_progressive import train_epoch_progressive
from ppcls.engine.train.train_metabin import train_epoch_metabin
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# reference: https://arxiv.org/abs/2011.14670v2
from __future__ import absolute_import, division, print_function
import time
import paddle
import numpy as np
from collections import defaultdict
from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name
from ppcls.utils import profiler
from ppcls.data import build_dataloader
from ppcls.arch.backbone.variant_models.resnet_variant import MetaBIN, BINGate
from ppcls.loss import build_loss
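# MetaBIN training loop: each iteration first performs a basic update of the whole
# network with the BIN gates frozen, then a meta-train / meta-test pair of steps that
# updates only the gates (see basic_update and metalearning_update below).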
def train_epoch_metabin(engine, epoch_id, print_batch_step):
tic = time.time()
if not hasattr(engine, "train_dataloader_iter"):
engine.train_dataloader_iter = iter(engine.train_dataloader)
if not hasattr(engine, "meta_dataloader"):
engine.meta_dataloader = build_dataloader(
config=engine.config['DataLoader']['Metalearning'],
mode='Train',
device=engine.device)
engine.meta_dataloader_iter = iter(engine.meta_dataloader)
num_domain = engine.train_dataloader.dataset.num_cams
for iter_id in range(engine.iter_per_epoch):
# fetch data batch from dataloader
try:
train_batch = next(engine.train_dataloader_iter)
except Exception:
engine.train_dataloader_iter = iter(engine.train_dataloader)
train_batch = next(engine.train_dataloader_iter)
try:
mtrain_batch, mtest_batch = get_meta_data(
engine.meta_dataloader_iter, num_domain)
except Exception:
engine.meta_dataloader_iter = iter(engine.meta_dataloader)
mtrain_batch, mtest_batch = get_meta_data(
engine.meta_dataloader_iter, num_domain)
profiler.add_profiler_step(engine.config["profiler_options"])
if iter_id == 5:
for key in engine.time_info:
engine.time_info[key].reset()
engine.time_info["reader_cost"].update(time.time() - tic)
train_batch_size = train_batch[0].shape[0]
mtrain_batch_size = mtrain_batch[0].shape[0]
mtest_batch_size = mtest_batch[0].shape[0]
if not engine.config["Global"].get("use_multilabel", False):
train_batch[1] = train_batch[1].reshape([train_batch_size, -1])
mtrain_batch[1] = mtrain_batch[1].reshape([mtrain_batch_size, -1])
mtest_batch[1] = mtest_batch[1].reshape([mtest_batch_size, -1])
engine.global_step += 1
if engine.global_step == 1: # update model (without gate) to warmup
for i in range(engine.config["Global"]["warmup_iter"] - 1):
out, basic_loss_dict = basic_update(engine, train_batch)
loss_dict = basic_loss_dict
try:
train_batch = next(engine.train_dataloader_iter)
except Exception:
engine.train_dataloader_iter = iter(
engine.train_dataloader)
train_batch = next(engine.train_dataloader_iter)
out, basic_loss_dict = basic_update(engine=engine, batch=train_batch)
mtrain_loss_dict, mtest_loss_dict = metalearning_update(
engine=engine, mtrain_batch=mtrain_batch, mtest_batch=mtest_batch)
        loss_dict = {
            **{"train_" + key: value for key, value in basic_loss_dict.items()},
            **{"mtrain_" + key: value for key, value in mtrain_loss_dict.items()},
            **{"mtest_" + key: value for key, value in mtest_loss_dict.items()},
        }
# step lr (by iter)
# the last lr_sch is cyclic_lr
for i in range(len(engine.lr_sch) - 1):
if not getattr(engine.lr_sch[i], "by_epoch", False):
engine.lr_sch[i].step()
# update ema
if engine.ema:
engine.model_ema.update(engine.model)
# below code just for logging
# update metric_for_logger
update_metric(engine, out, train_batch, train_batch_size)
# update_loss_for_logger
update_loss(engine, loss_dict, train_batch_size)
engine.time_info["batch_cost"].update(time.time() - tic)
if iter_id % print_batch_step == 0:
log_info(engine, train_batch_size, epoch_id, iter_id)
tic = time.time()
# step lr(by epoch)
# the last lr_sch is cyclic_lr
for i in range(len(engine.lr_sch) - 1):
if getattr(engine.lr_sch[i], "by_epoch", False) and \
type_name(engine.lr_sch[i]) != "ReduceOnPlateau":
engine.lr_sch[i].step()
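# Configure every MetaBIN layer for the given stage: "train" uses regular BN statistics
# and keeps the gates frozen; "mtrain" holds BN statistics with the gates still frozen;
# "mtest" holds BN statistics and enables the in-place gate update, with
# lr_gate = constant gate LR * current cyclic LR.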
def setup_opt(engine, stage):
assert stage in ["train", "mtrain", "mtest"]
opt = defaultdict()
if stage == "train":
opt["bn_mode"] = "general"
opt["enable_inside_update"] = False
opt["lr_gate"] = 0.0
elif stage == "mtrain":
opt["bn_mode"] = "hold"
opt["enable_inside_update"] = False
opt["lr_gate"] = 0.0
elif stage == "mtest":
norm_lr = engine.lr_sch[1].last_lr
cyclic_lr = engine.lr_sch[2].get_lr()
engine.lr_sch[2].step() # update cyclic learning rate
opt["bn_mode"] = "hold"
opt["enable_inside_update"] = True
opt["lr_gate"] = norm_lr * cyclic_lr
for layer in engine.model.sublayers():
if isinstance(layer, MetaBIN):
layer.setup_opt(opt)
def reset_opt(model):
for layer in model.sublayers():
if isinstance(layer, MetaBIN):
layer.reset_opt()
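# Randomly split the camera domains in half: samples from the first half form the
# meta-train batch, samples from the remaining domains form the meta-test batch.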
def get_meta_data(meta_dataloader_iter, num_domain):
"""
fetch data batch from dataloader then divide the batch by domains
"""
list_all = np.random.permutation(num_domain)
list_mtrain = list(list_all[:num_domain // 2])
batch = next(meta_dataloader_iter)
domain_idx = batch[2]
cnt = 0
for sample in list_mtrain:
if cnt == 0:
is_mtrain_domain = domain_idx == sample
else:
is_mtrain_domain = paddle.logical_or(is_mtrain_domain,
domain_idx == sample)
cnt += 1
    # meta-train batch: samples whose camera domain is in the sampled half
    if not any(is_mtrain_domain):
        raise RuntimeError("no samples fall into the meta-train domains")
    mtrain_batch = [batch[i][is_mtrain_domain] for i in range(len(batch))]
    # meta-test batch: samples from the remaining domains
    is_mtest_domains = paddle.logical_not(is_mtrain_domain)
    if not any(is_mtest_domains):
        raise RuntimeError("no samples fall into the meta-test domains")
    mtest_batch = [batch[i][is_mtest_domains] for i in range(len(batch))]
return mtrain_batch, mtest_batch
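# Run the model under AMP autocast and compute the stage-specific losses.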
def forward(engine, batch, loss_func):
batch_info = defaultdict()
batch_info = {"label": batch[1], "domain": batch[2]}
amp_level = engine.config["AMP"].get("level", "O1").upper()
with paddle.amp.auto_cast(
custom_black_list={"flatten_contiguous_range", "greater_than"},
level=amp_level):
out = engine.model(batch[0], batch[1])
loss_dict = loss_func(out, batch_info)
return out, loss_dict
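# Scale the loss for AMP, step the given optimizer, then clip the BIN gate parameters
# back into their valid range.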
def backward(engine, loss, optimizer):
scaled = engine.scaler.scale(loss)
scaled.backward()
engine.scaler.minimize(optimizer, scaled)
for layer in engine.model.sublayers():
if isinstance(layer, BINGate):
layer.clip_gate()
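# Basic update: optimize the whole network (gates frozen via the "train" stage) with
# the "Basic" losses and optimizer[0].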
def basic_update(engine, batch):
setup_opt(engine, "train")
train_loss_func = build_loss(engine.config["Loss"]["Basic"])
out, train_loss_dict = forward(engine, batch, train_loss_func)
train_loss = train_loss_dict["loss"]
backward(engine, train_loss, engine.optimizer[0])
engine.optimizer[0].clear_grad()
reset_opt(engine.model)
return out, train_loss_dict
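# Meta-learning update: the meta-train backward pass only populates gate gradients
# (no optimizer step); the meta-test forward pass then uses the in-place gate update
# enabled by setup_opt("mtest"), and the meta-test loss steps optimizer[1] (the gate
# optimizer).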
def metalearning_update(engine, mtrain_batch, mtest_batch):
# meta train
mtrain_loss_func = build_loss(engine.config["Loss"]["MetaTrain"])
setup_opt(engine, "mtrain")
mtrain_batch_info = defaultdict()
mtrain_batch_info = {"label": mtrain_batch[1], "domain": mtrain_batch[2]}
out = engine.model(mtrain_batch[0], mtrain_batch[1])
mtrain_loss_dict = mtrain_loss_func(out, mtrain_batch_info)
mtrain_loss = mtrain_loss_dict["loss"]
engine.optimizer[1].clear_grad()
mtrain_loss.backward()
# meta test
mtest_loss_func = build_loss(engine.config["Loss"]["MetaTest"])
setup_opt(engine, "mtest")
out, mtest_loss_dict = forward(engine, mtest_batch, mtest_loss_func)
engine.optimizer[1].clear_grad()
mtest_loss = mtest_loss_dict["loss"]
backward(engine, mtest_loss, engine.optimizer[1])
engine.optimizer[0].clear_grad()
engine.optimizer[1].clear_grad()
reset_opt(engine.model)
return mtrain_loss_dict, mtest_loss_dict