未验证 提交 cea47a39 编写于 作者: L leaves-zwx 提交者: GitHub

Merge pull request #2 from Oneflow-Inc/merge_jxf

Merge jxf
......@@ -32,9 +32,12 @@ DATASETS:
TRAIN: ("coco_2017_train", "coco_2017_val")
TEST: ("coco_2017_val",)
DATALOADER:
# xfjiang: for comparing loss curve
ASPECT_RATIO_GROUPING: True
SIZE_DIVISIBILITY: 32
SOLVER:
BASE_LR: 0.02
BASE_LR: 0.0025
WEIGHT_DECAY: 0.0001
STEPS: (60000, 80000)
MAX_ITER: 90000
STEPS: (480000, 640000)
MAX_ITER: 1
IMS_PER_BATCH: 1
DATALOADER:
ASPECT_RATIO_GROUPING: True
NUM_WORKERS: 4
SIZE_DIVISIBILITY: 32
DATASETS:
TEST: ('coco_1_image_train',)
TRAIN: ('coco_1_image_train', 'coco_1_image_train')
INPUT:
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN: 800
PIXEL_MEAN: [102.9801, 115.9465, 122.7717]
PIXEL_STD: [1.0, 1.0, 1.0]
TO_BGR255: True
MODEL:
BACKBONE:
CONV_BODY: R-50-FPN
FREEZE_CONV_BODY_AT: 2
OUT_CHANNELS: 256
DEVICE: cuda
MASK_ON: True
META_ARCHITECTURE: GeneralizedRCNN
RESNETS:
NUM_GROUPS: 1
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_FUNC: StemWithFixedBatchNorm
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: True
TRANS_FUNC: BottleneckWithFixedBatchNorm
WIDTH_PER_GROUP: 64
ROI_BOX_HEAD:
FEATURE_EXTRACTOR: FPN2MLPFeatureExtractor
MLP_HEAD_DIM: 1024
NUM_CLASSES: 81
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 2
POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
PREDICTOR: FPNPredictor
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
BBOX_REG_WEIGHTS: (10.0, 10.0, 5.0, 5.0)
BG_IOU_THRESHOLD: 0.5
DETECTIONS_PER_IMG: 100
FG_IOU_THRESHOLD: 0.5
NMS: 0.5
POSITIVE_FRACTION: 0.25
SCORE_THRESH: 0.05
USE_FPN: True
ROI_MASK_HEAD:
CONV_LAYERS: (256, 256, 256, 256)
FEATURE_EXTRACTOR: MaskRCNNFPNFeatureExtractor
MLP_HEAD_DIM: 1024
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 2
POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
POSTPROCESS_MASKS: False
POSTPROCESS_MASKS_THRESHOLD: 0.5
PREDICTOR: MaskRCNNC4Predictor
RESOLUTION: 28
SHARE_BOX_FEATURE_EXTRACTOR: False
RPN:
ANCHOR_SIZES: (32, 64, 128, 256, 512)
ANCHOR_STRIDE: (4, 8, 16, 32, 64)
ASPECT_RATIOS: (0.5, 1.0, 2.0)
BATCH_SIZE_PER_IMAGE: 256
BG_IOU_THRESHOLD: 0.3
FG_IOU_THRESHOLD: 0.7
FPN_POST_NMS_TOP_N_TEST: 1000
FPN_POST_NMS_TOP_N_TRAIN: 2000
MIN_SIZE: 0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOP_N_TEST: 1000
POST_NMS_TOP_N_TRAIN: 2000
PRE_NMS_TOP_N_TEST: 1000
PRE_NMS_TOP_N_TRAIN: 2000
RPN_HEAD: SingleConvRPNHead
STRADDLE_THRESH: 0
USE_FPN: True
RPN_ONLY: False
# WEIGHT: catalog://ImageNetPretrained/MSRA/R-50
WEIGHT: /home/xfjiang/repos/oneflow_toolkit/oneflow_toolkit/detection/convert_pytorch_model_to_of_model/examples/mask_rcnn_R_50_FPN_1x/e2e_mask_rcnn_R_50_FPN_1x.pth
OUTPUT_DIR: .
PATHS_CATALOG: /home/xfjiang/repos/maskrcnn-benchmark/maskrcnn_benchmark/config/paths_catalog.py
SOLVER:
BASE_LR: 0.0025
BIAS_LR_FACTOR: 2
CHECKPOINT_PERIOD: 2500
GAMMA: 0.1
IMS_PER_BATCH: 1
MAX_ITER: 90000
MOMENTUM: 0.9
STEPS: (480000, 640000)
WARMUP_FACTOR: 0.333333333333
WARMUP_ITERS: 500
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.0001
WEIGHT_DECAY_BIAS: 0
TEST:
EXPECTED_RESULTS: []
EXPECTED_RESULTS_SIGMA_TOL: 4
IMS_PER_BATCH: 8
\ No newline at end of file
......@@ -7,6 +7,14 @@ import os
class DatasetCatalog(object):
DATA_DIR = "/dataset"
DATASETS = {
"coco_1_image_train": {
"img_dir": "mscoco_2017/train2017",
"ann_file": "mscoco_2017/annotations/sample_1_instances_train2017.json"
},
"coco_1_image_val": {
"img_dir": "mscoco_2017/val2017",
"ann_file": "mscoco_2017/annotations/sample_1_instances_val2017.json"
},
"coco_2017_train": {
"img_dir": "mscoco_2017/train2017",
"ann_file": "mscoco_2017/annotations/instances_train2017.json"
......
......@@ -155,7 +155,9 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
data_loaders = []
for dataset in datasets:
sampler = make_data_sampler(dataset, shuffle, is_distributed)
# sampler = make_data_sampler(dataset, shuffle, is_distributed)
# xfjiang: use sequential sampler for comparing loss curve
sampler = torch.utils.data.sampler.SequentialSampler(dataset)
batch_sampler = make_batch_data_sampler(
dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter
)
......
......@@ -6,7 +6,7 @@ def build_transforms(cfg, is_train=True):
if is_train:
min_size = cfg.INPUT.MIN_SIZE_TRAIN
max_size = cfg.INPUT.MAX_SIZE_TRAIN
flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN
flip_prob = 0.0 # cfg.INPUT.FLIP_PROB_TRAIN
else:
min_size = cfg.INPUT.MIN_SIZE_TEST
max_size = cfg.INPUT.MAX_SIZE_TEST
......
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import datetime
import logging
......@@ -9,6 +11,12 @@ import torch.distributed as dist
from maskrcnn_benchmark.utils.comm import get_world_size
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
import numpy as np
import os
import maskrcnn_benchmark
from maskrcnn_benchmark.structures.bounding_box import BoxList
def reduce_loss_dict(loss_dict):
"""
......@@ -53,8 +61,127 @@ def do_train(
model.train()
start_training_time = time.time()
end = time.time()
module2name = {}
for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
# xfjiang: replace images
oneflow_images = np.load('/dataset/mask_rcnn/sample_1_train_of_decoded_blobs/encoded_(1, 1280, 800, 3).npy')
images.tensors = torch.tensor(np.transpose(oneflow_images, (0, 3, 1, 2)))
data_time = time.time() - end
def save_tensor(path, name, entity_to_save):
    """Write every tensor found in ``entity_to_save`` to ``path`` with ``np.save``.

    File names follow ``<path>/iter-<iteration - 1>.<label>.<size>``, where
    ``iteration`` is read from the enclosing scope, ``<label>`` is ``name``
    extended with ``_<index>`` (or ``_<dict key>``) for each nesting level and
    ``<size>`` is ``str(tensor.size())``.

    Supported payloads:
      * a ``torch.Tensor``;
      * a list of tensors (entries that are not tensors are skipped);
      * a tuple whose items are tensors, ``None`` (skipped), ``ImageList``
        (its ``tensors`` attribute is saved), dicts with string keys whose
        values are saved without a type check, tuples of tensors, or lists
        holding tensors / ``BoxList`` / lists of ``BoxList`` (``bbox`` is
        saved for every ``BoxList``).

    Args:
        path: directory that receives the files; it is not created here.
        name: label prefix used in the file names (e.g. "in" or "out").
        entity_to_save: the payload described above.

    Raises:
        AssertionError: if a value of an unsupported type is encountered.
    """

    def _dump(label, tensor):
        # Single place that builds the file name and moves the data to host memory.
        np.save(
            path + "/" + "iter-" + str(iteration - 1) + "." + label + "." + str(tensor.size()),
            tensor.detach().cpu().numpy(),
        )

    def _reject(obj):
        # Explicit raise instead of `assert False` so the check is not stripped by -O.
        raise AssertionError("save_tensor: unsupported type " + str(type(obj)))

    # isinstance is used throughout so that subclasses of the handled types
    # are accepted as well instead of being rejected.
    if isinstance(entity_to_save, tuple):
        for idx, item in enumerate(entity_to_save):
            label = name + "_" + str(idx)
            if item is None:
                # Nones carry no data; nothing to save.
                continue
            if isinstance(item, torch.Tensor):
                _dump(label, item)
            elif isinstance(item, list):
                for idx_2, item_2 in enumerate(item):
                    label_2 = label + "_" + str(idx_2)
                    if isinstance(item_2, torch.Tensor):
                        _dump(label_2, item_2)
                    elif isinstance(item_2, list):
                        for idx_3, item_3 in enumerate(item_2):
                            if isinstance(item_3, maskrcnn_benchmark.structures.bounding_box.BoxList):
                                _dump(label_2 + "_" + str(idx_3), item_3.bbox)
                            else:
                                _reject(item_3)
                    elif isinstance(item_2, maskrcnn_benchmark.structures.bounding_box.BoxList):
                        _dump(label_2, item_2.bbox)
                    else:
                        _reject(item_2)
            elif isinstance(item, dict):
                for key, value in item.items():
                    _dump(label + "_" + key, value)
            elif isinstance(item, tuple):
                for idx_2, item_2 in enumerate(item):
                    if isinstance(item_2, torch.Tensor):
                        _dump(label + "_" + str(idx_2), item_2)
                    else:
                        _reject(item_2)
            elif isinstance(item, maskrcnn_benchmark.structures.image_list.ImageList):
                _dump(label, item.tensors)
            else:
                _reject(item)
    elif isinstance(entity_to_save, list):
        for idx, item in enumerate(entity_to_save):
            # Fixed: the original tested the builtin `iter` instead of `item`,
            # so nothing in a top-level list was ever written.
            if isinstance(item, torch.Tensor):
                _dump(name + "_" + str(idx), item)
    elif isinstance(entity_to_save, torch.Tensor):
        _dump(name, entity_to_save)
    else:
        _reject(entity_to_save)
def fw_callback(module, input, output):
    """Forward hook that dumps a module's inputs and outputs.

    The module is looked up in ``module2name``; the directory
    ``'dump' + <module name>`` is created when missing, and ``input`` and
    ``output`` are handed to ``save_tensor`` under the labels "in" and "out".
    """
    dump_dir = 'dump' + module2name[module]
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    for label, payload in (("in", input), ("out", output)):
        save_tensor(dump_dir, label, payload)
def bw_callback(module, grad_input, grad_output):
    """Backward hook that dumps a module's gradients.

    The module is looked up in ``module2name``; the directory
    ``'dump' + <module name>`` is created when missing, and ``grad_input``
    and ``grad_output`` are handed to ``save_tensor`` under the labels
    "in_diff" and "out_diff".
    """
    dump_dir = 'dump' + module2name[module]
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    for label, payload in (("in_diff", grad_input), ("out_diff", grad_output)):
        save_tensor(dump_dir, label, payload)
def register_callback_rec_for_all_modules(module, prefix=""):
    """Attach the dump hooks to every descendant of ``module``.

    Each child is recorded in ``module2name`` under the slash-separated path
    ``prefix + "/" + <child name>``, gets ``fw_callback`` as forward hook and
    ``bw_callback`` as backward hook, and is then visited recursively with
    its own path as the new prefix.
    """
    for child_name, child in module.named_children():
        child_path = prefix + "/" + child_name
        module2name[child] = child_path
        child.register_forward_hook(fw_callback)
        child.register_backward_hook(bw_callback)
        # Descend so that grandchildren are named relative to this child.
        register_callback_rec_for_all_modules(child, child_path)
def register_callback_rec_for_particular_modules(module, names, prefix=""):
    """Attach the dump hooks only to the descendants listed in ``names``.

    A child is selected when either its own name or its slash-separated path
    (``prefix + "/" + <child name>``) is contained in ``names``. Selected
    children are recorded in ``module2name`` and receive ``fw_callback`` /
    ``bw_callback``. Every child is visited recursively, selected or not.
    """
    for child_name, child in module.named_children():
        child_path = prefix + "/" + child_name
        selected = child_name in names or child_path in names
        if selected:
            module2name[child] = child_path
            child.register_forward_hook(fw_callback)
            child.register_backward_hook(bw_callback)
        # Keep walking: a match may sit deeper in the tree.
        register_callback_rec_for_particular_modules(child, names, child_path)
# save modules' in, out, in_diff, out_diff
# register_callback_rec_for_all_modules(model)
# register_callback_rec_for_particular_modules(model, ['/backbone', '/rpn', '/roi_heads'])
iteration = iteration + 1
arguments["iteration"] = iteration
......
......@@ -3,6 +3,7 @@ import torch
import torch.nn.functional as F
from torch import nn
import numpy as np
class FPN(nn.Module):
"""
......@@ -66,6 +67,11 @@ class FPN(nn.Module):
last_results = self.top_blocks(results[-1])
results.extend(last_results)
# xfjiang: save blobs
for idx, x in enumerate(results):
save_path = "./new_dump/backbone/fpn" + str(idx + 1) + "-out" + "." + str(x.size())
np.save(save_path, x.detach().cpu().numpy())
return tuple(results)
......
......@@ -20,6 +20,8 @@ from maskrcnn_benchmark.layers import FrozenBatchNorm2d
from maskrcnn_benchmark.layers import Conv2d
from maskrcnn_benchmark.utils.registry import Registry
import os
import numpy as np
# ResNet stage specification
StageSpec = namedtuple(
......@@ -115,10 +117,23 @@ class ResNet(nn.Module):
def forward(self, x):
    """Run the stem and all stages, dumping intermediate activations to disk.

    The stem output and every collected stage output are written with
    ``np.save`` below ``./new_dump/backbone`` (created when missing).

    Returns:
        list with the output of each stage in ``self.stages`` whose entry in
        ``self.return_features`` is truthy, in stage order.
    """
    features = []
    x = self.stem(x)

    # xfjiang: save blobs
    if not os.path.exists('./new_dump/backbone'):
        os.makedirs('./new_dump/backbone')
    stem_target = "./new_dump/backbone/backbone-stem-out" + "." + str(x.size())
    np.save(stem_target, x.detach().cpu().numpy())

    for stage_name in self.stages:
        stage = getattr(self, stage_name)
        x = stage(x)
        if self.return_features[stage_name]:
            features.append(x)

    # xfjiang: save blobs (file names are numbered from 1)
    for position, feature in enumerate(features, 1):
        target = "./new_dump/backbone/backbone-layer" + str(position) + "-out" + "." + str(feature.size())
        np.save(target, feature.detach().cpu().numpy())

    return features
......
......@@ -15,6 +15,7 @@ from maskrcnn_benchmark.modeling.matcher import Matcher
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou
from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
import numpy as np
class RPNLossComputation(object):
"""
......@@ -79,15 +80,46 @@ class RPNLossComputation(object):
"""
Arguments:
anchors (list[BoxList])
example:
[
[BoxList(num_boxes=163200, image_width=1066, image_height=800, mode=xyxy),
BoxList(num_boxes=40800, image_width=1066, image_height=800, mode=xyxy),
BoxList(num_boxes=10200, image_width=1066, image_height=800, mode=xyxy),
BoxList(num_boxes=2550, image_width=1066, image_height=800, mode=xyxy),
BoxList(num_boxes=663, image_width=1066, image_height=800, mode=xyxy)]
]
objectness (list[Tensor])
example:
list of <class 'torch.Tensor'>, len(objectness) == 5
shape of each torch.Tensor:
[(1, 3, 200, 272),
(1, 3, 100, 136),
(1, 3, 50, 68),
(1, 3, 25, 34),
(1, 3, 13, 17)]
box_regression (list[Tensor])
example:
list of <class 'torch.Tensor'>, len(bbox_regression) == 5
[(1, 12, 200, 272),
(1, 12, 100, 136),
(1, 12, 50, 68),
(1, 12, 25, 34),
(1, 12, 13, 17)]
targets (list[BoxList])
example:
[BoxList(num_boxes=23, image_width=1066, image_height=800, mode=xyxy)]
Returns:
objectness_loss (Tensor)
box_loss (Tensor
box_loss (Tensor)
"""
anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]
# xfjiang: save blobs
generated_anchors_save_path = "./new_dump/rpn/generated_anchors" + "." + str(anchors[0].bbox.size())
np.save(generated_anchors_save_path, anchors[0].bbox.detach().cpu().numpy())
labels, regression_targets = self.prepare_targets(anchors, targets)
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
......@@ -122,6 +154,24 @@ class RPNLossComputation(object):
labels = torch.cat(labels, dim=0)
regression_targets = torch.cat(regression_targets, dim=0)
# xfjiang: save blobs
sampled_objectness_npy = objectness[sampled_inds].cpu().detach().numpy()
sampled_objectness_save_path = "./new_dump/rpn/sampled_objectness" + "." + str(sampled_objectness_npy.shape)
np.save(sampled_objectness_save_path, sampled_objectness_npy)
sampled_labels_npy = labels[sampled_inds].cpu().detach().numpy()
sampled_labels_save_path = "./new_dump/rpn/sampled_labels" + "." + str(sampled_labels_npy.shape)
np.save(sampled_labels_save_path, sampled_labels_npy)
sampled_box_regression_npy = box_regression[sampled_pos_inds].cpu().detach().numpy()
sampled_box_regression_save_path = "./new_dump/rpn/sampled_box_regression" + "." + str(sampled_box_regression_npy.shape)
np.save(sampled_box_regression_save_path, sampled_box_regression_npy)
sampled_regression_targets_npy = regression_targets[sampled_pos_inds].cpu().detach().numpy()
sampled_regression_targets_save_path = "./new_dump/rpn/sampled_regression_targets" + "." + str(sampled_regression_targets_npy.shape)
np.save(sampled_regression_targets_save_path, sampled_regression_targets_npy)
sampled_pos_inds_save_path = "./new_dump/rpn/sampled_pos_inds" + "." + str(sampled_pos_inds.size())
np.save(sampled_pos_inds_save_path, sampled_pos_inds.cpu().detach().numpy())
sampled_inds_save_path = "./new_dump/rpn/sampled_inds" + "." + str(sampled_inds.size())
np.save(sampled_inds_save_path, sampled_inds.cpu().detach().numpy())
box_loss = smooth_l1_loss(
box_regression[sampled_pos_inds],
regression_targets[sampled_pos_inds],
......
......@@ -9,6 +9,8 @@ from .loss import make_rpn_loss_evaluator
from .anchor_generator import make_anchor_generator
from .inference import make_rpn_postprocessor
import numpy as np
import os
@registry.RPN_HEADS.register("SingleConvRPNHead")
class RPNHead(nn.Module):
......@@ -115,9 +117,27 @@ class RPNModule(torch.nn.Module):
boxes = self.box_selector_train(
anchors, objectness, rpn_box_regression, targets
)
# xfjiang: save blobs
if not os.path.exists("./new_dump/rpn/"):
os.makedirs("./new_dump/rpn/")
# print(type(boxes)) # list
# print(len(boxes)) # 1
# print(type(boxes[0])) # <class 'maskrcnn_benchmark.structures.bounding_box.BoxList'>
bbox = boxes[0].bbox
bbox_save_path = "./new_dump/rpn/bbox" + "." + str(bbox.size())
np.save(bbox_save_path, bbox.detach().cpu().numpy())
loss_objectness, loss_rpn_box_reg = self.loss_evaluator(
anchors, objectness, rpn_box_regression, targets
)
# xfjiang: save blobs
loss_objectness_save_path = "./new_dump/rpn/" + "rpn_objectiness_loss" + "." + str(loss_objectness.size())
loss_rpn_box_reg_save_path = "./new_dump/rpn/" + "rpn_box_loss" + "." + str(loss_objectness.size())
np.save(loss_objectness_save_path ,loss_objectness.detach().cpu().numpy())
np.save(loss_rpn_box_reg_save_path, loss_rpn_box_reg.detach().cpu().numpy())
losses = {
"loss_objectness": loss_objectness,
"loss_rpn_box_reg": loss_rpn_box_reg,
......
python ./tools/train_net.py \
--config-file "./configs/customized_e2e_mask_rcnn_R_50_C4_1x.yaml"
# Clean up before launching a new training run.
# ./dump and ./new_dump are the directories the tensor-dumping debug code writes into;
# the remaining files are presumably checkpoints/logs left by a previous run -- TODO confirm.
rm -r ./dump
rm -r ./new_dump
rm last_checkpoint
rm model_final.pth
rm log.txt
rm model_0090000.pth
# Disabled alternative: launch through torch.distributed.launch with NGPUS processes.
# export NGPUS=1
# python -m torch.distributed.launch --nproc_per_node=$NGPUS \
# ./tools/train_net.py\
# --config-file "./configs/customized_e2e_mask_rcnn_R_50_FPN_1x_all.yaml" \
# --skip-test
# Active command: run train_net.py directly with the same config and --skip-test.
python ./tools/train_net.py\
--config-file "./configs/customized_e2e_mask_rcnn_R_50_FPN_1x_all.yaml" \
--skip-test
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册