diff --git a/mrcnn/config.py b/mrcnn/config.py index a651c1885003abae0ac910e15bb9ddbf9217d368..4b9e0475de478f310c70622e9053309552bed097 100644 --- a/mrcnn/config.py +++ b/mrcnn/config.py @@ -56,6 +56,12 @@ class Config(object): # are based on a Resnet101 backbone. BACKBONE_STRIDES = [4, 8, 16, 32, 64] + # Size of the fully-connected layers in the classification graph + FPN_CLASSIF_FC_LAYERS_SIZE = 1024 + + # Size of the top-down layers used to build the feature pyramid + TOP_DOWN_PYRAMID_SIZE = 256 + # Number of classification classes (including background) NUM_CLASSES = 1 # Override in sub-classes diff --git a/mrcnn/model.py b/mrcnn/model.py index 76d329e1ca132a3a671b146333a8555e3774ea98..a5a70c9982f25e91b044094c464be9c22079714b 100644 --- a/mrcnn/model.py +++ b/mrcnn/model.py @@ -898,7 +898,8 @@ def build_rpn_model(anchor_stride, anchors_per_location, depth): ############################################################ def fpn_classifier_graph(rois, feature_maps, image_meta, - pool_size, num_classes, train_bn=True): + pool_size, num_classes, train_bn=True, + fc_layers_size=1024): """Builds the computation graph of the feature pyramid network classifier and regressor heads. @@ -910,6 +911,7 @@ def fpn_classifier_graph(rois, feature_maps, image_meta, pool_size: The width of the square feature map generated from ROI Pooling. num_classes: number of classes, which determines the depth of the results train_bn: Boolean. Train or freeze Batch Norm layres + fc_layers_size: Size of the 2 FC layers Returns: logits: [N, NUM_CLASSES] classifier logits (before softmax) @@ -922,11 +924,11 @@ def fpn_classifier_graph(rois, feature_maps, image_meta, x = PyramidROIAlign([pool_size, pool_size], name="roi_align_classifier")([rois, image_meta] + feature_maps) # Two 1024 FC layers (implemented with Conv2D for consistency) - x = KL.TimeDistributed(KL.Conv2D(1024, (pool_size, pool_size), padding="valid"), + x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"), name="mrcnn_class_conv1")(x) x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_bn) x = KL.Activation('relu')(x) - x = KL.TimeDistributed(KL.Conv2D(1024, (1, 1)), + x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)), name="mrcnn_class_conv2")(x) x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_bn) x = KL.Activation('relu')(x) @@ -1887,21 +1889,21 @@ class MaskRCNN(): stage5=True, train_bn=config.TRAIN_BN) # Top-down Layers # TODO: add assert to varify feature map sizes match what's in config - P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5) + P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5) P4 = KL.Add(name="fpn_p4add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5), - KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)]) + KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)]) P3 = KL.Add(name="fpn_p3add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), - KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)]) + KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)]) P2 = KL.Add(name="fpn_p2add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), - KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)]) + KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)]) # Attach 3x3 conv to all P layers to get the final feature maps. - P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2) - P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3) - P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4) - P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5) + P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2) + P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3) + P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4) + P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5) # P6 is used for the 5th anchor scale in RPN. Generated by # subsampling from P5 with stride of 2. P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5) @@ -1923,7 +1925,7 @@ class MaskRCNN(): # RPN Model rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, - len(config.RPN_ANCHOR_RATIOS), 256) + len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE) # Loop through pyramid layers layer_outputs = [] # list of lists for p in rpn_feature_maps: @@ -1980,7 +1982,8 @@ class MaskRCNN(): mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta, config.POOL_SIZE, config.NUM_CLASSES, - train_bn=config.TRAIN_BN) + train_bn=config.TRAIN_BN, + fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE) mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps, input_image_meta, @@ -2019,7 +2022,8 @@ class MaskRCNN(): mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta, config.POOL_SIZE, config.NUM_CLASSES, - train_bn=config.TRAIN_BN) + train_bn=config.TRAIN_BN, + fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE) # Detections # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in