From 879603101b7523b73d3675619ae7ece574da0364 Mon Sep 17 00:00:00 2001 From: wangxinxin08 Date: Fri, 9 Dec 2022 09:45:39 +0000 Subject: [PATCH] fix bugs --- ppdet/modeling/vl/embedder/__init__.py | 1 + ppdet/modeling/vl/embedder/clip/__init__.py | 2 +- ppdet/modeling/vl/embedder/clip/clip.py | 2 +- ppdet/modeling/vl/embedder/clip/layers.py | 10 +++++----- ppdet/modeling/vl/embedder/clip/models.py | 20 +++++++++++-------- ppdet/modeling/vl/head/__init__.py | 2 ++ ppdet/modeling/vl/head/owl_vit_head.py | 7 ++++--- ppdet/modeling/vl/loss/__init__.py | 2 ++ ppdet/modeling/vl/loss/owl_vit_loss.py | 2 +- .../modeling/vl/tokenizer/simple_tokenizer.py | 1 + 10 files changed, 30 insertions(+), 19 deletions(-) diff --git a/ppdet/modeling/vl/embedder/__init__.py b/ppdet/modeling/vl/embedder/__init__.py index 9e28baadf..864638732 100644 --- a/ppdet/modeling/vl/embedder/__init__.py +++ b/ppdet/modeling/vl/embedder/__init__.py @@ -21,6 +21,7 @@ import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register +from .clip import * __all__ = ['ClipImageTextEmbedder'] diff --git a/ppdet/modeling/vl/embedder/clip/__init__.py b/ppdet/modeling/vl/embedder/clip/__init__.py index 4cf4e7bf7..185d771fe 100644 --- a/ppdet/modeling/vl/embedder/clip/__init__.py +++ b/ppdet/modeling/vl/embedder/clip/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .models import ModifiedResNet, TextEncoder, VisionTransformer +from .models import ModifiedResNet, TextEncoder, ViT from .layers import LayerNorm, QuickGELU, AttentionPool2D from .clip import CLIP diff --git a/ppdet/modeling/vl/embedder/clip/clip.py b/ppdet/modeling/vl/embedder/clip/clip.py index 8d6d01808..6e15f142e 100644 --- a/ppdet/modeling/vl/embedder/clip/clip.py +++ b/ppdet/modeling/vl/embedder/clip/clip.py @@ -31,7 +31,7 @@ from ppdet.modeling.layers import MultiHeadAttention from ppdet.modeling.initializer import zeros_, normal_ from ppdet.core.workspace import register -from .models import ModifiedResNet, VisionTransformer, TextEncoder +from .models import ModifiedResNet, ViT, TextEncoder @register diff --git a/ppdet/modeling/vl/embedder/clip/layers.py b/ppdet/modeling/vl/embedder/clip/layers.py index fca8c8815..eee7eb50d 100644 --- a/ppdet/modeling/vl/embedder/clip/layers.py +++ b/ppdet/modeling/vl/embedder/clip/layers.py @@ -84,7 +84,7 @@ class Bottleneck(nn.Layer): return out -class AttentionPool2D(nn.Module): +class AttentionPool2D(nn.Layer): def __init__(self, spacial_dim, embed_dim, num_heads, output_dim): super().__init__() # TODO: need check whether it is consistent with torch or not @@ -151,10 +151,9 @@ class ResidualAttentionBlock(nn.Layer): self.attn = MultiHeadAttention(d_model, n_head) self.ln_1 = LayerNorm(d_model) - self.mlp = nn.Sequential( - OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ( - "gelu", QuickGELU()), ("c_proj", nn.Linear(d_model * 4, d_model) - )])) + self.mlp = nn.Sequential(("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model))) self.ln_2 = LayerNorm(d_model) self.attn_mask = attn_mask self.droplayer_p = droplayer_p @@ -192,6 +191,7 @@ class Transformer(nn.Layer): super().__init__() self.width = width self.layers = layers + self.stochastic_droplayer_rate = stochastic_droplayer_rate blocks = [] for i in range(self.layers): droplayer_p = (i / max(self.layers - 1, diff --git a/ppdet/modeling/vl/embedder/clip/models.py b/ppdet/modeling/vl/embedder/clip/models.py index 49ee8d007..d4af77fee 100644 --- a/ppdet/modeling/vl/embedder/clip/models.py +++ b/ppdet/modeling/vl/embedder/clip/models.py @@ -32,7 +32,7 @@ from ppdet.core.workspace import register from .layers import * -__all__ = ['ModifiedResNet', 'VisionTransformer', 'TextEncoder'] +__all__ = ['ModifiedResNet', 'ViT', 'TextEncoder'] @register @@ -105,7 +105,7 @@ class ModifiedResNet(nn.Layer): @register -class VisionTransformer(nn.Layer): +class ViT(nn.Layer): def __init__(self, input_resolution, patch_size, @@ -115,6 +115,7 @@ class VisionTransformer(nn.Layer): output_dim=None, stochastic_droplayer_rate=0.0): super().__init__() + self.width = width self.input_resolution = input_resolution self.output_dim = output_dim self.conv1 = nn.Conv2D( @@ -122,7 +123,7 @@ class VisionTransformer(nn.Layer): out_channels=width, kernel_size=patch_size, stride=patch_size, - bias=False) + bias_attr=False) scale = width**-0.5 self.class_embedding = self.create_parameter( shape=[width], attr=ParamAttr(initializer=Normal(std=scale))) @@ -157,9 +158,14 @@ class VisionTransformer(nn.Layer): @register class TextEncoder(nn.Layer): - def __init__(self, context_length, vocab_size, transformer_width, - transformer_heads, transformer_layers, - stochastic_droplayer_rate): + def __init__(self, + embed_dim, + context_length, + vocab_size, + transformer_width, + transformer_heads, + transformer_layers, + stochastic_droplayer_rate=0.0): super().__init__() self.context_length = context_length @@ -178,8 +184,6 @@ class TextEncoder(nn.Layer): self.ln_final = LayerNorm(transformer_width) self.text_projection = nn.Linear( transformer_width, embed_dim, bias_attr=False) - self.logit_scale = self.create_parameter( - shape=[], attr=ParamAttr(initializer=Constant(np.log(1. / 0.07)))) def build_attention_mask(self): # lazily create causal attention mask, with full attention between the vision tokens diff --git a/ppdet/modeling/vl/head/__init__.py b/ppdet/modeling/vl/head/__init__.py index 97043fd7b..2de8a9ab6 100644 --- a/ppdet/modeling/vl/head/__init__.py +++ b/ppdet/modeling/vl/head/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .owl_vit_head import * \ No newline at end of file diff --git a/ppdet/modeling/vl/head/owl_vit_head.py b/ppdet/modeling/vl/head/owl_vit_head.py index 560744329..fafa20f7d 100644 --- a/ppdet/modeling/vl/head/owl_vit_head.py +++ b/ppdet/modeling/vl/head/owl_vit_head.py @@ -22,6 +22,7 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.modeling.ops import get_act_fn +from ppdet.core.workspace import register from ..utils import compute_box_bias @@ -46,12 +47,13 @@ class PredictorMLP(nn.Layer): in_dim, out_dim, num_layers, - mlp_dim, - hidden_activation, + mlp_dim=None, + hidden_activation='gelu', out_activation=None): super().__init__() layers = [] + mlp_dim = in_dim if mlp_dim is None else mlp_dim for _ in range(num_layers - 1): layers.append(nn.Linear(in_dim, mlp_dim)) in_dim = mlp_dim @@ -138,7 +140,6 @@ class OWLViTHead(nn.Layer): self.class_head = class_head self.bbox_head = bbox_head self.box_bias = box_bias - self.matcher = matcher self.loss = loss def box_predictor(self, image_features, feature_map): diff --git a/ppdet/modeling/vl/loss/__init__.py b/ppdet/modeling/vl/loss/__init__.py index 97043fd7b..1b6789619 100644 --- a/ppdet/modeling/vl/loss/__init__.py +++ b/ppdet/modeling/vl/loss/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .owl_vit_loss import * \ No newline at end of file diff --git a/ppdet/modeling/vl/loss/owl_vit_loss.py b/ppdet/modeling/vl/loss/owl_vit_loss.py index b5fdfd92f..0d11c44f2 100644 --- a/ppdet/modeling/vl/loss/owl_vit_loss.py +++ b/ppdet/modeling/vl/loss/owl_vit_loss.py @@ -32,7 +32,7 @@ class OWLViTLoss(nn.Layer): __inject__ = ['HungarianMatcher'] def __init__(self, - num_classes, + num_classes=80, matcher='HungarianMatcher', normalization='per_example', loss_coeff=None, diff --git a/ppdet/modeling/vl/tokenizer/simple_tokenizer.py b/ppdet/modeling/vl/tokenizer/simple_tokenizer.py index 723da452d..f82b7b315 100644 --- a/ppdet/modeling/vl/tokenizer/simple_tokenizer.py +++ b/ppdet/modeling/vl/tokenizer/simple_tokenizer.py @@ -21,6 +21,7 @@ from __future__ import print_function import gzip import html import os +import functools from functools import lru_cache import ftfy -- GitLab