提交 87960310 编写于 作者: W wangxinxin08

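fix bugs: rename VisionTransformer to ViT, use Paddle APIs (nn.Layer, bias_attr), and add missing attributes and default arguments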

上级 e142c6a3
......@@ -21,6 +21,7 @@ import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .clip import *
__all__ = ['ClipImageTextEmbedder']
......
......@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .models import ModifiedResNet, TextEncoder, VisionTransformer
from .models import ModifiedResNet, TextEncoder, ViT
from .layers import LayerNorm, QuickGELU, AttentionPool2D
from .clip import CLIP
......@@ -31,7 +31,7 @@ from ppdet.modeling.layers import MultiHeadAttention
from ppdet.modeling.initializer import zeros_, normal_
from ppdet.core.workspace import register
from .models import ModifiedResNet, VisionTransformer, TextEncoder
from .models import ModifiedResNet, ViT, TextEncoder
@register
......
......@@ -84,7 +84,7 @@ class Bottleneck(nn.Layer):
return out
class AttentionPool2D(nn.Module):
class AttentionPool2D(nn.Layer):
def __init__(self, spacial_dim, embed_dim, num_heads, output_dim):
super().__init__()
# TODO: check whether this is consistent with the PyTorch implementation
......@@ -151,10 +151,9 @@ class ResidualAttentionBlock(nn.Layer):
self.attn = MultiHeadAttention(d_model, n_head)
self.ln_1 = LayerNorm(d_model)
self.mlp = nn.Sequential(
OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), (
"gelu", QuickGELU()), ("c_proj", nn.Linear(d_model * 4, d_model)
)]))
self.mlp = nn.Sequential(("c_fc", nn.Linear(d_model, d_model * 4)),
("gelu", QuickGELU()),
("c_proj", nn.Linear(d_model * 4, d_model)))
self.ln_2 = LayerNorm(d_model)
self.attn_mask = attn_mask
self.droplayer_p = droplayer_p
......@@ -192,6 +191,7 @@ class Transformer(nn.Layer):
super().__init__()
self.width = width
self.layers = layers
self.stochastic_droplayer_rate = stochastic_droplayer_rate
blocks = []
for i in range(self.layers):
droplayer_p = (i / max(self.layers - 1,
......
......@@ -32,7 +32,7 @@ from ppdet.core.workspace import register
from .layers import *
__all__ = ['ModifiedResNet', 'VisionTransformer', 'TextEncoder']
__all__ = ['ModifiedResNet', 'ViT', 'TextEncoder']
@register
......@@ -105,7 +105,7 @@ class ModifiedResNet(nn.Layer):
@register
class VisionTransformer(nn.Layer):
class ViT(nn.Layer):
def __init__(self,
input_resolution,
patch_size,
......@@ -115,6 +115,7 @@ class VisionTransformer(nn.Layer):
output_dim=None,
stochastic_droplayer_rate=0.0):
super().__init__()
self.width = width
self.input_resolution = input_resolution
self.output_dim = output_dim
self.conv1 = nn.Conv2D(
......@@ -122,7 +123,7 @@ class VisionTransformer(nn.Layer):
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)
bias_attr=False)
scale = width**-0.5
self.class_embedding = self.create_parameter(
shape=[width], attr=ParamAttr(initializer=Normal(std=scale)))
......@@ -157,9 +158,14 @@ class VisionTransformer(nn.Layer):
@register
class TextEncoder(nn.Layer):
def __init__(self, context_length, vocab_size, transformer_width,
transformer_heads, transformer_layers,
stochastic_droplayer_rate):
def __init__(self,
embed_dim,
context_length,
vocab_size,
transformer_width,
transformer_heads,
transformer_layers,
stochastic_droplayer_rate=0.0):
super().__init__()
self.context_length = context_length
......@@ -178,8 +184,6 @@ class TextEncoder(nn.Layer):
self.ln_final = LayerNorm(transformer_width)
self.text_projection = nn.Linear(
transformer_width, embed_dim, bias_attr=False)
self.logit_scale = self.create_parameter(
shape=[], attr=ParamAttr(initializer=Constant(np.log(1. / 0.07))))
def build_attention_mask(self):
# lazily create causal attention mask, with full attention between the vision tokens
......
......@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .owl_vit_head import *
\ No newline at end of file
......@@ -22,6 +22,7 @@ import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.ops import get_act_fn
from ppdet.core.workspace import register
from ..utils import compute_box_bias
......@@ -46,12 +47,13 @@ class PredictorMLP(nn.Layer):
in_dim,
out_dim,
num_layers,
mlp_dim,
hidden_activation,
mlp_dim=None,
hidden_activation='gelu',
out_activation=None):
super().__init__()
layers = []
mlp_dim = in_dim if mlp_dim is None else mlp_dim
for _ in range(num_layers - 1):
layers.append(nn.Linear(in_dim, mlp_dim))
in_dim = mlp_dim
......@@ -138,7 +140,6 @@ class OWLViTHead(nn.Layer):
self.class_head = class_head
self.bbox_head = bbox_head
self.box_bias = box_bias
self.matcher = matcher
self.loss = loss
def box_predictor(self, image_features, feature_map):
......
......@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .owl_vit_loss import *
\ No newline at end of file
......@@ -32,7 +32,7 @@ class OWLViTLoss(nn.Layer):
__inject__ = ['HungarianMatcher']
def __init__(self,
num_classes,
num_classes=80,
matcher='HungarianMatcher',
normalization='per_example',
loss_coeff=None,
......
......@@ -21,6 +21,7 @@ from __future__ import print_function
import gzip
import html
import os
import functools
from functools import lru_cache
import ftfy
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册