Commit 87960310 authored by wangxinxin08

fix bugs

Parent e142c6a3
@@ -21,6 +21,7 @@ import paddle.nn as nn
 import paddle.nn.functional as F
 from ppdet.core.workspace import register
+from .clip import *
 __all__ = ['ClipImageTextEmbedder']
...
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .models import ModifiedResNet, TextEncoder, VisionTransformer
+from .models import ModifiedResNet, TextEncoder, ViT
 from .layers import LayerNorm, QuickGELU, AttentionPool2D
 from .clip import CLIP
@@ -31,7 +31,7 @@ from ppdet.modeling.layers import MultiHeadAttention
 from ppdet.modeling.initializer import zeros_, normal_
 from ppdet.core.workspace import register
-from .models import ModifiedResNet, VisionTransformer, TextEncoder
+from .models import ModifiedResNet, ViT, TextEncoder
 @register
...
@@ -84,7 +84,7 @@ class Bottleneck(nn.Layer):
         return out

-class AttentionPool2D(nn.Module):
+class AttentionPool2D(nn.Layer):
     def __init__(self, spacial_dim, embed_dim, num_heads, output_dim):
         super().__init__()
         # TODO: need check whether it is consistent with torch or not
@@ -151,10 +151,9 @@ class ResidualAttentionBlock(nn.Layer):
         self.attn = MultiHeadAttention(d_model, n_head)
         self.ln_1 = LayerNorm(d_model)
-        self.mlp = nn.Sequential(
-            OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), (
-                "gelu", QuickGELU()), ("c_proj", nn.Linear(d_model * 4, d_model)
-                                       )]))
+        self.mlp = nn.Sequential(("c_fc", nn.Linear(d_model, d_model * 4)),
+                                 ("gelu", QuickGELU()),
+                                 ("c_proj", nn.Linear(d_model * 4, d_model)))
         self.ln_2 = LayerNorm(d_model)
         self.attn_mask = attn_mask
         self.droplayer_p = droplayer_p
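For reference, Paddle's nn.Sequential accepts (name, sublayer) tuples directly, so the torch-style OrderedDict wrapper removed above is unnecessary. A minimal sketch of the same MLP block, with an illustrative width and nn.GELU standing in for the repo's QuickGELU so it runs on its own:

```python
import paddle
import paddle.nn as nn

d_model = 512  # illustrative width, not taken from the diff

# (name, layer) tuples register named sublayers, matching the new form above.
mlp = nn.Sequential(
    ("c_fc", nn.Linear(d_model, d_model * 4)),
    ("gelu", nn.GELU()),  # stand-in for QuickGELU
    ("c_proj", nn.Linear(d_model * 4, d_model)))

out = mlp(paddle.randn([2, d_model]))  # -> shape [2, 512]
```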
@@ -192,6 +191,7 @@ class Transformer(nn.Layer):
         super().__init__()
         self.width = width
         self.layers = layers
+        self.stochastic_droplayer_rate = stochastic_droplayer_rate
         blocks = []
         for i in range(self.layers):
             droplayer_p = (i / max(self.layers - 1,
...
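The loop above assigns each block its own drop-layer probability; the full expression is cut off in this view, but storing stochastic_droplayer_rate suggests the usual stochastic-depth schedule that ramps linearly from 0 for the first block to the configured rate for the last. A sketch under that assumption (not the repo's exact code):

```python
# Assumed linear stochastic-depth ramp; the real expression is truncated in
# the diff above, so treat these values as illustrative only.
layers = 12                      # illustrative depth
stochastic_droplayer_rate = 0.1  # illustrative maximum drop probability

for i in range(layers):
    droplayer_p = (i / max(layers - 1, 1)) * stochastic_droplayer_rate
    # block 0 -> 0.0, ..., last block -> stochastic_droplayer_rate
```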
@@ -32,7 +32,7 @@ from ppdet.core.workspace import register
 from .layers import *
-__all__ = ['ModifiedResNet', 'VisionTransformer', 'TextEncoder']
+__all__ = ['ModifiedResNet', 'ViT', 'TextEncoder']
 @register
@@ -105,7 +105,7 @@ class ModifiedResNet(nn.Layer):
 @register
-class VisionTransformer(nn.Layer):
+class ViT(nn.Layer):
     def __init__(self,
                  input_resolution,
                  patch_size,
@@ -115,6 +115,7 @@ class VisionTransformer(nn.Layer):
                  output_dim=None,
                  stochastic_droplayer_rate=0.0):
         super().__init__()
+        self.width = width
         self.input_resolution = input_resolution
         self.output_dim = output_dim
         self.conv1 = nn.Conv2D(
@@ -122,7 +123,7 @@ class VisionTransformer(nn.Layer):
             out_channels=width,
             kernel_size=patch_size,
             stride=patch_size,
-            bias=False)
+            bias_attr=False)
         scale = width**-0.5
         self.class_embedding = self.create_parameter(
             shape=[width], attr=ParamAttr(initializer=Normal(std=scale)))
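The bias keyword is the torch spelling; Paddle's nn.Conv2D takes bias_attr, and passing False disables the bias term entirely, which is what the patch-embedding conv above needs. A minimal sketch with illustrative sizes (3-channel input, width 768, patch size 32; the real values come from the model config):

```python
import paddle
import paddle.nn as nn

# bias_attr=False turns the bias off; sizes here are only illustrative.
conv1 = nn.Conv2D(
    in_channels=3,
    out_channels=768,
    kernel_size=32,
    stride=32,
    bias_attr=False)

patches = conv1(paddle.randn([1, 3, 224, 224]))  # -> [1, 768, 7, 7]
```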
@@ -157,9 +158,14 @@ class VisionTransformer(nn.Layer):
 @register
 class TextEncoder(nn.Layer):
-    def __init__(self, context_length, vocab_size, transformer_width,
-                 transformer_heads, transformer_layers,
-                 stochastic_droplayer_rate):
+    def __init__(self,
+                 embed_dim,
+                 context_length,
+                 vocab_size,
+                 transformer_width,
+                 transformer_heads,
+                 transformer_layers,
+                 stochastic_droplayer_rate=0.0):
         super().__init__()
         self.context_length = context_length
@@ -178,8 +184,6 @@
         self.ln_final = LayerNorm(transformer_width)
         self.text_projection = nn.Linear(
             transformer_width, embed_dim, bias_attr=False)
-        self.logit_scale = self.create_parameter(
-            shape=[], attr=ParamAttr(initializer=Constant(np.log(1. / 0.07))))

     def build_attention_mask(self):
         # lazily create causal attention mask, with full attention between the vision tokens
...
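With embed_dim now an explicit constructor argument and stochastic_droplayer_rate given a default, building the text tower on its own would look roughly like the following. The numeric values are the common CLIP ViT-B settings and are purely illustrative; in PaddleDetection these arguments normally come from the YAML config through the registry rather than a direct call:

```python
# Illustrative instantiation of the TextEncoder defined in this file.
text_encoder = TextEncoder(
    embed_dim=512,           # projection width shared with the image tower
    context_length=77,
    vocab_size=49408,
    transformer_width=512,
    transformer_heads=8,
    transformer_layers=12)   # stochastic_droplayer_rate defaults to 0.0
```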
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .owl_vit_head import *
\ No newline at end of file
@@ -22,6 +22,7 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 from ppdet.modeling.ops import get_act_fn
+from ppdet.core.workspace import register
 from ..utils import compute_box_bias
@@ -46,12 +47,13 @@ class PredictorMLP(nn.Layer):
                  in_dim,
                  out_dim,
                  num_layers,
-                 mlp_dim,
-                 hidden_activation,
+                 mlp_dim=None,
+                 hidden_activation='gelu',
                  out_activation=None):
         super().__init__()
         layers = []
+        mlp_dim = in_dim if mlp_dim is None else mlp_dim
         for _ in range(num_layers - 1):
             layers.append(nn.Linear(in_dim, mlp_dim))
             in_dim = mlp_dim
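The two defaults above let the prediction MLPs be built without an explicit hidden width: when mlp_dim is None it falls back to in_dim. A minimal self-contained sketch of that behaviour follows; TinyPredictorMLP is a hypothetical stand-in, and nn.GELU replaces the activation lookup done through get_act_fn in the real head:

```python
import paddle
import paddle.nn as nn

class TinyPredictorMLP(nn.Layer):
    """Hypothetical reduced version of PredictorMLP, for illustration only."""

    def __init__(self, in_dim, out_dim, num_layers, mlp_dim=None):
        super().__init__()
        # Same fallback as in the diff: hidden width defaults to the input width.
        mlp_dim = in_dim if mlp_dim is None else mlp_dim
        layers = []
        for _ in range(num_layers - 1):
            layers.append(nn.Linear(in_dim, mlp_dim))
            layers.append(nn.GELU())
            in_dim = mlp_dim
        layers.append(nn.Linear(in_dim, out_dim))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)

boxes = TinyPredictorMLP(in_dim=768, out_dim=4, num_layers=3)(
    paddle.randn([2, 100, 768]))  # -> [2, 100, 4]
```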
@@ -138,7 +140,6 @@ class OWLViTHead(nn.Layer):
         self.class_head = class_head
         self.bbox_head = bbox_head
         self.box_bias = box_bias
-        self.matcher = matcher
         self.loss = loss

     def box_predictor(self, image_features, feature_map):
...
@@ -11,3 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .owl_vit_loss import *
\ No newline at end of file
@@ -32,7 +32,7 @@ class OWLViTLoss(nn.Layer):
     __inject__ = ['HungarianMatcher']

     def __init__(self,
-                 num_classes,
+                 num_classes=80,
                  matcher='HungarianMatcher',
                  normalization='per_example',
                  loss_coeff=None,
...
@@ -21,6 +21,7 @@ from __future__ import print_function
 import gzip
 import html
 import os
+import functools
 from functools import lru_cache

 import ftfy
...