提交 f2212a71 编写于 作者: L LielinJiang

Merge branch 'master' of https://github.com/PaddlePaddle/hapi into lenet

...@@ -16,14 +16,60 @@ ...@@ -16,14 +16,60 @@
import paddle.fluid as fluid import paddle.fluid as fluid
from hapi.metrics import Accuracy from hapi.metrics import Accuracy
from hapi.configure import Config from hapi.configure import Config
from hapi.text.bert import BertEncoder
from paddle.fluid.dygraph import Linear, Layer
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from cls import ClsModelLayer
import hapi.text.tokenizer.tokenization as tokenization import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
def train(): class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
cls_feats = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
pred = self.cls_fc(cls_feats)
return pred
def main():
config = Config(yaml_file="./bert.yaml") config = Config(yaml_file="./bert.yaml")
config.build() config.build()
...@@ -35,8 +81,6 @@ def train(): ...@@ -35,8 +81,6 @@ def train():
bert_config = BertConfig(config.bert_config_path) bert_config = BertConfig(config.bert_config_path)
bert_config.print_config() bert_config.print_config()
trainer_count = fluid.dygraph.parallel.Env().nranks
tokenizer = tokenization.FullTokenizer( tokenizer = tokenization.FullTokenizer(
vocab_file=config.vocab_path, do_lower_case=config.do_lower_case) vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
...@@ -52,14 +96,24 @@ def train(): ...@@ -52,14 +96,24 @@ def train():
return BertInputExample( return BertInputExample(
uid=uid, text_a=text_a, text_b=text_b, label=label) uid=uid, text_a=text_a, text_b=text_b, label=label)
bert_dataloader = BertDataLoader( train_dataloader = BertDataLoader(
"./data/glue_data/MNLI/train.tsv", "./data/glue_data/MNLI/train.tsv",
tokenizer, ["contradiction", "entailment", "neutral"], tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=64, max_seq_length=config.max_seq_len,
batch_size=32, batch_size=config.batch_size,
line_processor=mnli_line_processor) line_processor=mnli_line_processor)
num_train_examples = len(bert_dataloader.dataset) test_dataloader = BertDataLoader(
"./data/glue_data/MNLI/dev_matched.tsv",
tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=config.max_seq_len,
batch_size=config.batch_size,
line_processor=mnli_line_processor,
shuffle=False,
phase="predict")
trainer_count = fluid.dygraph.parallel.Env().nranks
num_train_examples = len(train_dataloader.dataset)
max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
warmup_steps = int(max_train_steps * config.warmup_proportion) warmup_steps = int(max_train_steps * config.warmup_proportion)
...@@ -82,7 +136,6 @@ def train(): ...@@ -82,7 +136,6 @@ def train():
config, config,
bert_config, bert_config,
len(["contradiction", "entailment", "neutral"]), len(["contradiction", "entailment", "neutral"]),
is_training=True,
return_pooled_out=True) return_pooled_out=True)
optimizer = Optimizer( optimizer = Optimizer(
...@@ -106,10 +159,15 @@ def train(): ...@@ -106,10 +159,15 @@ def train():
cls_model.bert_layer.init_parameters( cls_model.bert_layer.init_parameters(
config.init_pretraining_params, verbose=config.verbose) config.init_pretraining_params, verbose=config.verbose)
cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch) # do train
cls_model.fit(train_data=train_dataloader.dataloader,
epochs=config.epoch,
save_dir=config.checkpoints)
return cls_model # do eval
cls_model.evaluate(
eval_data=test_dataloader.dataloader, batch_size=config.batch_size)
if __name__ == '__main__': if __name__ == '__main__':
cls_model = train() main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert import BertEncoder
from hapi.model import Model
class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
is_training=True,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
cls_feats = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logits = self.cls_fc(cls_feats)
return logits
...@@ -18,7 +18,7 @@ batch_size: 32 ...@@ -18,7 +18,7 @@ batch_size: 32
in_tokens: False in_tokens: False
do_lower_case: True do_lower_case: True
random_seed: 5512 random_seed: 5512
use_cuda: False use_cuda: True
shuffle: True shuffle: True
do_train: True do_train: True
do_test: True do_test: True
......
...@@ -16,14 +16,60 @@ ...@@ -16,14 +16,60 @@
import paddle.fluid as fluid import paddle.fluid as fluid
from hapi.metrics import Accuracy from hapi.metrics import Accuracy
from hapi.configure import Config from hapi.configure import Config
from hapi.text.bert import BertEncoder
from paddle.fluid.dygraph import Linear, Layer
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from cls import ClsModelLayer
import hapi.text.tokenizer.tokenization as tokenization import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
def train(): class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
cls_feats = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
pred = self.cls_fc(cls_feats)
return pred
def main():
config = Config(yaml_file="./bert.yaml") config = Config(yaml_file="./bert.yaml")
config.build() config.build()
...@@ -35,8 +81,6 @@ def train(): ...@@ -35,8 +81,6 @@ def train():
bert_config = BertConfig(config.bert_config_path) bert_config = BertConfig(config.bert_config_path)
bert_config.print_config() bert_config.print_config()
trainer_count = fluid.dygraph.parallel.Env().nranks
tokenizer = tokenization.FullTokenizer( tokenizer = tokenization.FullTokenizer(
vocab_file=config.vocab_path, do_lower_case=config.do_lower_case) vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
...@@ -52,15 +96,26 @@ def train(): ...@@ -52,15 +96,26 @@ def train():
return BertInputExample( return BertInputExample(
uid=uid, text_a=text_a, text_b=text_b, label=label) uid=uid, text_a=text_a, text_b=text_b, label=label)
bert_dataloader = BertDataLoader( train_dataloader = BertDataLoader(
"./data/glue_data/MNLI/train.tsv", "./data/glue_data/MNLI/train.tsv",
tokenizer, ["contradiction", "entailment", "neutral"], tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=64, max_seq_length=config.max_seq_len,
batch_size=32, batch_size=config.batch_size,
line_processor=mnli_line_processor, line_processor=mnli_line_processor,
mode="leveldb") mode="leveldb",
phase="train")
num_train_examples = len(bert_dataloader.dataset) test_dataloader = BertDataLoader(
"./data/glue_data/MNLI/dev_matched.tsv",
tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=config.max_seq_len,
batch_size=config.batch_size,
line_processor=mnli_line_processor,
shuffle=False,
phase="predict")
trainer_count = fluid.dygraph.parallel.Env().nranks
num_train_examples = len(train_dataloader.dataset)
max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
warmup_steps = int(max_train_steps * config.warmup_proportion) warmup_steps = int(max_train_steps * config.warmup_proportion)
...@@ -83,7 +138,6 @@ def train(): ...@@ -83,7 +138,6 @@ def train():
config, config,
bert_config, bert_config,
len(["contradiction", "entailment", "neutral"]), len(["contradiction", "entailment", "neutral"]),
is_training=True,
return_pooled_out=True) return_pooled_out=True)
optimizer = Optimizer( optimizer = Optimizer(
...@@ -107,10 +161,15 @@ def train(): ...@@ -107,10 +161,15 @@ def train():
cls_model.bert_layer.init_parameters( cls_model.bert_layer.init_parameters(
config.init_pretraining_params, verbose=config.verbose) config.init_pretraining_params, verbose=config.verbose)
cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch) # do train
cls_model.fit(train_data=train_dataloader.dataloader,
epochs=config.epoch,
save_dir=config.checkpoints)
return cls_model # do eval
cls_model.evaluate(
eval_data=test_dataloader.dataloader, batch_size=config.batch_size)
if __name__ == '__main__': if __name__ == '__main__':
cls_model = train() main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert import BertEncoder
from hapi.model import Model
class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
is_training=True,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
cls_feats = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logits = self.cls_fc(cls_feats)
return logits
grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
2020-04-13 13:08:30,568-WARNING: use_shared_memory can only be used in multi-process mode(num_workers > 0), set use_shared_memory as False
W0413 13:08:31.584532 119379 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W0413 13:08:31.589192 119379 device_context.cc:245] device: 0, cuDNN Version: 7.5.
----------------------------------------------------------------------
bert_config_path: ./data/pretrained_models/uncased_L-12_H-768_A-12//bert_config.json
init_checkpoint: None
init_pretraining_params: ./data/pretrained_models/uncased_L-12_H-768_A-12//dygraph_params/
checkpoints: ./data/saved_model/mnli_models
epoch: 3
learning_rate: 5e-05
lr_scheduler: linear_warmup_decay
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 1000
validation_steps: 100
loss_scaling: 1.0
skip_steps: 10
data_dir: ./data/glue_data/MNLI/
vocab_path: ./data/pretrained_models/uncased_L-12_H-768_A-12//vocab.txt
max_seq_len: 128
batch_size: 64
in_tokens: False
do_lower_case: True
random_seed: 5512
use_cuda: True
shuffle: True
do_train: True
do_test: True
use_data_parallel: False
verbose: False
----------------------------------------------------------------------
attention_probs_dropout_prob: 0.1
hidden_act: gelu
hidden_dropout_prob: 0.1
hidden_size: 768
initializer_range: 0.02
intermediate_size: 3072
max_position_embeddings: 512
num_attention_heads: 12
num_hidden_layers: 12
type_vocab_size: 2
vocab_size: 30522
------------------------------------------------
Trainer count: 1
Num train examples: 392703
Max train steps: 18407
Num warmup steps: 1840
Epoch 1/3
step 10/12272 - loss: 1.1000 - acc_top1: 0.3531 - acc_top2: 0.6813 - 1s/step
step 20/12272 - loss: 1.1878 - acc_top1: 0.3578 - acc_top2: 0.6875 - 1s/step
step 30/12272 - loss: 1.0812 - acc_top1: 0.3708 - acc_top2: 0.6948 - 1s/step
step 40/12272 - loss: 1.1244 - acc_top1: 0.3773 - acc_top2: 0.6992 - 1s/step
step 50/12272 - loss: 1.1202 - acc_top1: 0.3756 - acc_top2: 0.7006 - 1s/step
step 60/12272 - loss: 1.1291 - acc_top1: 0.3703 - acc_top2: 0.6990 - 1s/step
step 70/12272 - loss: 1.0991 - acc_top1: 0.3634 - acc_top2: 0.6946 - 1s/step
step 80/12272 - loss: 1.0988 - acc_top1: 0.3602 - acc_top2: 0.6914 - 1s/step
step 90/12272 - loss: 1.0718 - acc_top1: 0.3646 - acc_top2: 0.6889 - 1s/step
step 100/12272 - loss: 1.0949 - acc_top1: 0.3638 - acc_top2: 0.6878 - 1s/step
step 110/12272 - loss: 1.1120 - acc_top1: 0.3608 - acc_top2: 0.6895 - 1s/step
step 120/12272 - loss: 1.1105 - acc_top1: 0.3622 - acc_top2: 0.6922 - 1s/step
step 130/12272 - loss: 1.0958 - acc_top1: 0.3623 - acc_top2: 0.6940 - 1s/step
step 140/12272 - loss: 1.0995 - acc_top1: 0.3636 - acc_top2: 0.6926 - 1s/step
step 150/12272 - loss: 1.1272 - acc_top1: 0.3671 - acc_top2: 0.6950 - 1s/step
step 160/12272 - loss: 1.0850 - acc_top1: 0.3697 - acc_top2: 0.6975 - 1s/step
step 170/12272 - loss: 1.0607 - acc_top1: 0.3691 - acc_top2: 0.6991 - 1s/step
step 180/12272 - loss: 1.0623 - acc_top1: 0.3707 - acc_top2: 0.6991 - 1s/step
step 190/12272 - loss: 1.1092 - acc_top1: 0.3697 - acc_top2: 0.6997 - 1s/step
step 200/12272 - loss: 1.1046 - acc_top1: 0.3713 - acc_top2: 0.7030 - 1s/step
step 210/12272 - loss: 1.0945 - acc_top1: 0.3720 - acc_top2: 0.7043 - 1s/step
step 220/12272 - loss: 1.0935 - acc_top1: 0.3719 - acc_top2: 0.7051 - 1s/step
step 230/12272 - loss: 1.1567 - acc_top1: 0.3742 - acc_top2: 0.7048 - 1s/step
step 240/12272 - loss: 1.0745 - acc_top1: 0.3766 - acc_top2: 0.7081 - 1s/step
step 250/12272 - loss: 1.0664 - acc_top1: 0.3756 - acc_top2: 0.7090 - 1s/step
step 260/12272 - loss: 1.0770 - acc_top1: 0.3751 - acc_top2: 0.7085 - 1s/step
step 270/12272 - loss: 1.1008 - acc_top1: 0.3730 - acc_top2: 0.7088 - 1s/step
step 280/12272 - loss: 1.0850 - acc_top1: 0.3737 - acc_top2: 0.7098 - 1s/step
step 290/12272 - loss: 1.0759 - acc_top1: 0.3747 - acc_top2: 0.7100 - 1s/step
step 300/12272 - loss: 1.0352 - acc_top1: 0.3758 - acc_top2: 0.7108 - 1s/step
step 310/12272 - loss: 1.0224 - acc_top1: 0.3786 - acc_top2: 0.7127 - 1s/step
step 320/12272 - loss: 1.0919 - acc_top1: 0.3800 - acc_top2: 0.7137 - 1s/step
step 330/12272 - loss: 1.0884 - acc_top1: 0.3825 - acc_top2: 0.7145 - 1s/step
step 340/12272 - loss: 1.1380 - acc_top1: 0.3849 - acc_top2: 0.7157 - 1s/step
step 350/12272 - loss: 0.9523 - acc_top1: 0.3890 - acc_top2: 0.7176 - 1s/step
step 360/12272 - loss: 0.9963 - acc_top1: 0.3922 - acc_top2: 0.7191 - 1s/step
step 370/12272 - loss: 1.1187 - acc_top1: 0.3955 - acc_top2: 0.7205 - 1s/step
step 380/12272 - loss: 0.9634 - acc_top1: 0.3988 - acc_top2: 0.7229 - 1s/step
step 390/12272 - loss: 0.9944 - acc_top1: 0.4017 - acc_top2: 0.7254 - 1s/step
step 400/12272 - loss: 1.1071 - acc_top1: 0.4044 - acc_top2: 0.7272 - 1s/step
step 410/12272 - loss: 0.9307 - acc_top1: 0.4070 - acc_top2: 0.7293 - 1s/step
step 420/12272 - loss: 1.1307 - acc_top1: 0.4087 - acc_top2: 0.7315 - 1s/step
step 430/12272 - loss: 0.9936 - acc_top1: 0.4110 - acc_top2: 0.7334 - 1s/step
step 440/12272 - loss: 0.9791 - acc_top1: 0.4139 - acc_top2: 0.7357 - 1s/step
step 450/12272 - loss: 1.0112 - acc_top1: 0.4147 - acc_top2: 0.7372 - 1s/step
step 460/12272 - loss: 0.8554 - acc_top1: 0.4179 - acc_top2: 0.7395 - 1s/step
step 470/12272 - loss: 0.9411 - acc_top1: 0.4198 - acc_top2: 0.7406 - 1s/step
step 480/12272 - loss: 0.8481 - acc_top1: 0.4231 - acc_top2: 0.7424 - 1s/step
step 490/12272 - loss: 1.0338 - acc_top1: 0.4261 - acc_top2: 0.7441 - 1s/step
step 500/12272 - loss: 0.9651 - acc_top1: 0.4281 - acc_top2: 0.7459 - 1s/step
step 510/12272 - loss: 0.8091 - acc_top1: 0.4306 - acc_top2: 0.7479 - 1s/step
step 520/12272 - loss: 1.0528 - acc_top1: 0.4325 - acc_top2: 0.7489 - 1s/step
step 530/12272 - loss: 0.9898 - acc_top1: 0.4338 - acc_top2: 0.7500 - 1s/step
step 540/12272 - loss: 0.7900 - acc_top1: 0.4364 - acc_top2: 0.7519 - 1s/step
step 550/12272 - loss: 0.9055 - acc_top1: 0.4389 - acc_top2: 0.7534 - 1s/step
step 560/12272 - loss: 1.0092 - acc_top1: 0.4410 - acc_top2: 0.7549 - 1s/step
step 570/12272 - loss: 0.7068 - acc_top1: 0.4441 - acc_top2: 0.7570 - 1s/step
step 580/12272 - loss: 0.9695 - acc_top1: 0.4455 - acc_top2: 0.7581 - 1s/step
step 590/12272 - loss: 0.8640 - acc_top1: 0.4487 - acc_top2: 0.7600 - 1s/step
step 600/12272 - loss: 0.9068 - acc_top1: 0.4514 - acc_top2: 0.7618 - 1s/step
step 610/12272 - loss: 0.9023 - acc_top1: 0.4524 - acc_top2: 0.7627 - 1s/step
step 620/12272 - loss: 0.7377 - acc_top1: 0.4552 - acc_top2: 0.7640 - 1s/step
step 630/12272 - loss: 0.8900 - acc_top1: 0.4574 - acc_top2: 0.7659 - 1s/step
step 640/12272 - loss: 0.8902 - acc_top1: 0.4590 - acc_top2: 0.7669 - 1s/step
step 650/12272 - loss: 0.9069 - acc_top1: 0.4608 - acc_top2: 0.7686 - 1s/step
step 660/12272 - loss: 0.9630 - acc_top1: 0.4631 - acc_top2: 0.7699 - 1s/step
step 670/12272 - loss: 0.9005 - acc_top1: 0.4652 - acc_top2: 0.7712 - 1s/step
step 680/12272 - loss: 1.0725 - acc_top1: 0.4670 - acc_top2: 0.7725 - 1s/step
step 690/12272 - loss: 0.8322 - acc_top1: 0.4689 - acc_top2: 0.7739 - 1s/step
step 700/12272 - loss: 0.9874 - acc_top1: 0.4714 - acc_top2: 0.7753 - 1s/step
step 710/12272 - loss: 0.7915 - acc_top1: 0.4728 - acc_top2: 0.7765 - 1s/step
step 720/12272 - loss: 0.7174 - acc_top1: 0.4746 - acc_top2: 0.7777 - 1s/step
step 730/12272 - loss: 0.7635 - acc_top1: 0.4770 - acc_top2: 0.7793 - 1s/step
step 740/12272 - loss: 0.9180 - acc_top1: 0.4793 - acc_top2: 0.7804 - 1s/step
step 750/12272 - loss: 0.8424 - acc_top1: 0.4817 - acc_top2: 0.7815 - 1s/step
step 760/12272 - loss: 0.9357 - acc_top1: 0.4837 - acc_top2: 0.7829 - 1s/step
step 770/12272 - loss: 0.7643 - acc_top1: 0.4858 - acc_top2: 0.7839 - 1s/step
step 780/12272 - loss: 0.8910 - acc_top1: 0.4868 - acc_top2: 0.7849 - 1s/step
step 790/12272 - loss: 0.8781 - acc_top1: 0.4888 - acc_top2: 0.7862 - 1s/step
step 800/12272 - loss: 0.8005 - acc_top1: 0.4907 - acc_top2: 0.7877 - 1s/step
step 810/12272 - loss: 0.6740 - acc_top1: 0.4929 - acc_top2: 0.7889 - 1s/step
step 820/12272 - loss: 0.7026 - acc_top1: 0.4947 - acc_top2: 0.7898 - 1s/step
step 830/12272 - loss: 0.8666 - acc_top1: 0.4964 - acc_top2: 0.7908 - 1s/step
step 840/12272 - loss: 0.6296 - acc_top1: 0.4983 - acc_top2: 0.7920 - 1s/step
step 850/12272 - loss: 0.7907 - acc_top1: 0.4992 - acc_top2: 0.7930 - 1s/step
step 860/12272 - loss: 0.7292 - acc_top1: 0.5007 - acc_top2: 0.7935 - 1s/step
step 870/12272 - loss: 0.7498 - acc_top1: 0.5026 - acc_top2: 0.7944 - 1s/step
step 880/12272 - loss: 0.9928 - acc_top1: 0.5040 - acc_top2: 0.7953 - 1s/step
step 890/12272 - loss: 1.0025 - acc_top1: 0.5056 - acc_top2: 0.7962 - 1s/step
step 900/12272 - loss: 0.7810 - acc_top1: 0.5071 - acc_top2: 0.7969 - 1s/step
step 910/12272 - loss: 0.6114 - acc_top1: 0.5090 - acc_top2: 0.7978 - 1s/step
step 920/12272 - loss: 0.7780 - acc_top1: 0.5105 - acc_top2: 0.7988 - 1s/step
step 930/12272 - loss: 0.9457 - acc_top1: 0.5116 - acc_top2: 0.7995 - 1s/step
step 940/12272 - loss: 0.7907 - acc_top1: 0.5135 - acc_top2: 0.8006 - 1s/step
step 950/12272 - loss: 0.5520 - acc_top1: 0.5153 - acc_top2: 0.8013 - 1s/step
step 960/12272 - loss: 0.8251 - acc_top1: 0.5168 - acc_top2: 0.8022 - 1s/step
step 970/12272 - loss: 0.8482 - acc_top1: 0.5179 - acc_top2: 0.8031 - 1s/step
step 980/12272 - loss: 0.8010 - acc_top1: 0.5196 - acc_top2: 0.8038 - 1s/step
step 990/12272 - loss: 0.8326 - acc_top1: 0.5207 - acc_top2: 0.8047 - 1s/step
step 1000/12272 - loss: 0.6979 - acc_top1: 0.5222 - acc_top2: 0.8057 - 1s/step
step 1010/12272 - loss: 0.7506 - acc_top1: 0.5234 - acc_top2: 0.8065 - 1s/step
step 1020/12272 - loss: 0.8457 - acc_top1: 0.5248 - acc_top2: 0.8073 - 1s/step
step 1030/12272 - loss: 0.8698 - acc_top1: 0.5263 - acc_top2: 0.8082 - 1s/step
step 1040/12272 - loss: 0.7016 - acc_top1: 0.5279 - acc_top2: 0.8091 - 1s/step
step 1050/12272 - loss: 0.7766 - acc_top1: 0.5290 - acc_top2: 0.8099 - 1s/step
step 1060/12272 - loss: 0.7994 - acc_top1: 0.5300 - acc_top2: 0.8105 - 1s/step
step 1070/12272 - loss: 0.7053 - acc_top1: 0.5317 - acc_top2: 0.8115 - 1s/step
step 1080/12272 - loss: 0.9085 - acc_top1: 0.5330 - acc_top2: 0.8125 - 1s/step
step 1090/12272 - loss: 0.7556 - acc_top1: 0.5342 - acc_top2: 0.8134 - 1s/step
step 1100/12272 - loss: 0.9364 - acc_top1: 0.5355 - acc_top2: 0.8141 - 1s/step
step 1110/12272 - loss: 0.9403 - acc_top1: 0.5367 - acc_top2: 0.8148 - 1s/step
step 1120/12272 - loss: 0.8228 - acc_top1: 0.5375 - acc_top2: 0.8152 - 1s/step
step 1130/12272 - loss: 0.6802 - acc_top1: 0.5388 - acc_top2: 0.8160 - 1s/step
step 1140/12272 - loss: 0.8222 - acc_top1: 0.5397 - acc_top2: 0.8167 - 1s/step
step 1150/12272 - loss: 0.9321 - acc_top1: 0.5407 - acc_top2: 0.8172 - 1s/step
step 1160/12272 - loss: 0.7478 - acc_top1: 0.5417 - acc_top2: 0.8181 - 1s/step
step 1170/12272 - loss: 0.7976 - acc_top1: 0.5430 - acc_top2: 0.8188 - 1s/step
step 1180/12272 - loss: 0.7386 - acc_top1: 0.5441 - acc_top2: 0.8192 - 1s/step
step 1190/12272 - loss: 0.6448 - acc_top1: 0.5450 - acc_top2: 0.8200 - 1s/step
step 1200/12272 - loss: 0.7441 - acc_top1: 0.5463 - acc_top2: 0.8206 - 1s/step
step 1210/12272 - loss: 0.8171 - acc_top1: 0.5476 - acc_top2: 0.8213 - 1s/step
step 1220/12272 - loss: 0.7480 - acc_top1: 0.5487 - acc_top2: 0.8219 - 1s/step
step 1230/12272 - loss: 0.6363 - acc_top1: 0.5497 - acc_top2: 0.8225 - 1s/step
step 1240/12272 - loss: 0.6630 - acc_top1: 0.5507 - acc_top2: 0.8231 - 1s/step
step 1250/12272 - loss: 0.8668 - acc_top1: 0.5517 - acc_top2: 0.8237 - 1s/step
step 1260/12272 - loss: 0.6057 - acc_top1: 0.5527 - acc_top2: 0.8243 - 1s/step
step 1270/12272 - loss: 0.8432 - acc_top1: 0.5538 - acc_top2: 0.8248 - 1s/step
step 1280/12272 - loss: 0.8447 - acc_top1: 0.5546 - acc_top2: 0.8253 - 1s/step
step 1290/12272 - loss: 0.6928 - acc_top1: 0.5556 - acc_top2: 0.8261 - 1s/step
step 1300/12272 - loss: 0.7872 - acc_top1: 0.5567 - acc_top2: 0.8266 - 1s/step
step 1310/12272 - loss: 0.7968 - acc_top1: 0.5570 - acc_top2: 0.8269 - 1s/step
step 1320/12272 - loss: 0.8059 - acc_top1: 0.5580 - acc_top2: 0.8275 - 1s/step
step 1330/12272 - loss: 0.8603 - acc_top1: 0.5587 - acc_top2: 0.8278 - 1s/step
step 1340/12272 - loss: 0.7872 - acc_top1: 0.5599 - acc_top2: 0.8285 - 1s/step
step 1350/12272 - loss: 0.7037 - acc_top1: 0.5609 - acc_top2: 0.8290 - 1s/step
step 1360/12272 - loss: 0.8268 - acc_top1: 0.5618 - acc_top2: 0.8297 - 1s/step
step 1370/12272 - loss: 0.5962 - acc_top1: 0.5627 - acc_top2: 0.8303 - 1s/step
step 1380/12272 - loss: 0.7712 - acc_top1: 0.5638 - acc_top2: 0.8310 - 1s/step
step 1390/12272 - loss: 0.5770 - acc_top1: 0.5650 - acc_top2: 0.8315 - 1s/step
step 1400/12272 - loss: 0.7174 - acc_top1: 0.5656 - acc_top2: 0.8319 - 1s/step
step 1410/12272 - loss: 0.6224 - acc_top1: 0.5660 - acc_top2: 0.8323 - 1s/step
step 1420/12272 - loss: 0.6782 - acc_top1: 0.5671 - acc_top2: 0.8328 - 1s/step
step 1430/12272 - loss: 0.4087 - acc_top1: 0.5682 - acc_top2: 0.8335 - 1s/step
step 1440/12272 - loss: 0.7534 - acc_top1: 0.5692 - acc_top2: 0.8342 - 1s/step
step 1450/12272 - loss: 0.6446 - acc_top1: 0.5702 - acc_top2: 0.8345 - 1s/step
step 1460/12272 - loss: 0.6606 - acc_top1: 0.5712 - acc_top2: 0.8351 - 1s/step
step 1470/12272 - loss: 0.7308 - acc_top1: 0.5723 - acc_top2: 0.8357 - 1s/step
step 1480/12272 - loss: 0.9016 - acc_top1: 0.5727 - acc_top2: 0.8359 - 1s/step
step 1490/12272 - loss: 0.8445 - acc_top1: 0.5730 - acc_top2: 0.8362 - 1s/step
step 1500/12272 - loss: 0.8217 - acc_top1: 0.5737 - acc_top2: 0.8367 - 1s/step
step 1510/12272 - loss: 0.8413 - acc_top1: 0.5747 - acc_top2: 0.8370 - 1s/step
step 1520/12272 - loss: 0.4643 - acc_top1: 0.5757 - acc_top2: 0.8376 - 1s/step
step 1530/12272 - loss: 0.9351 - acc_top1: 0.5764 - acc_top2: 0.8381 - 1s/step
step 1540/12272 - loss: 0.7856 - acc_top1: 0.5773 - acc_top2: 0.8386 - 1s/step
step 1550/12272 - loss: 0.5921 - acc_top1: 0.5780 - acc_top2: 0.8390 - 1s/step
step 1560/12272 - loss: 0.4460 - acc_top1: 0.5788 - acc_top2: 0.8395 - 1s/step
step 1570/12272 - loss: 0.6814 - acc_top1: 0.5793 - acc_top2: 0.8401 - 1s/step
step 1580/12272 - loss: 0.4115 - acc_top1: 0.5805 - acc_top2: 0.8407 - 1s/step
step 1590/12272 - loss: 0.9326 - acc_top1: 0.5810 - acc_top2: 0.8410 - 1s/step
step 1600/12272 - loss: 0.6989 - acc_top1: 0.5818 - acc_top2: 0.8413 - 1s/step
step 1610/12272 - loss: 0.5238 - acc_top1: 0.5826 - acc_top2: 0.8418 - 1s/step
step 1620/12272 - loss: 0.5827 - acc_top1: 0.5832 - acc_top2: 0.8422 - 1s/step
step 1630/12272 - loss: 0.7703 - acc_top1: 0.5838 - acc_top2: 0.8425 - 1s/step
step 1640/12272 - loss: 0.7926 - acc_top1: 0.5844 - acc_top2: 0.8428 - 1s/step
step 1650/12272 - loss: 0.7143 - acc_top1: 0.5851 - acc_top2: 0.8434 - 1s/step
step 1660/12272 - loss: 0.6240 - acc_top1: 0.5858 - acc_top2: 0.8438 - 1s/step
step 1670/12272 - loss: 0.7869 - acc_top1: 0.5862 - acc_top2: 0.8440 - 1s/step
step 1680/12272 - loss: 0.6485 - acc_top1: 0.5868 - acc_top2: 0.8444 - 1s/step
step 1690/12272 - loss: 0.7539 - acc_top1: 0.5876 - acc_top2: 0.8450 - 1s/step
step 1700/12272 - loss: 0.6173 - acc_top1: 0.5882 - acc_top2: 0.8454 - 1s/step
step 1710/12272 - loss: 0.8056 - acc_top1: 0.5890 - acc_top2: 0.8458 - 1s/step
step 1720/12272 - loss: 0.7035 - acc_top1: 0.5898 - acc_top2: 0.8463 - 1s/step
step 1730/12272 - loss: 0.5892 - acc_top1: 0.5908 - acc_top2: 0.8468 - 1s/step
step 1740/12272 - loss: 0.7755 - acc_top1: 0.5915 - acc_top2: 0.8472 - 1s/step
step 1750/12272 - loss: 0.6911 - acc_top1: 0.5920 - acc_top2: 0.8474 - 1s/step
step 1760/12272 - loss: 0.6309 - acc_top1: 0.5926 - acc_top2: 0.8477 - 1s/step
step 1770/12272 - loss: 0.7506 - acc_top1: 0.5932 - acc_top2: 0.8480 - 1s/step
step 1780/12272 - loss: 0.8711 - acc_top1: 0.5939 - acc_top2: 0.8482 - 1s/step
step 1790/12272 - loss: 0.9146 - acc_top1: 0.5945 - acc_top2: 0.8484 - 1s/step
step 1800/12272 - loss: 0.6208 - acc_top1: 0.5952 - acc_top2: 0.8487 - 1s/step
step 1810/12272 - loss: 0.8506 - acc_top1: 0.5959 - acc_top2: 0.8490 - 1s/step
step 1820/12272 - loss: 0.8330 - acc_top1: 0.5965 - acc_top2: 0.8494 - 1s/step
step 1830/12272 - loss: 0.8315 - acc_top1: 0.5970 - acc_top2: 0.8497 - 1s/step
step 1840/12272 - loss: 0.6227 - acc_top1: 0.5977 - acc_top2: 0.8501 - 1s/step
step 1850/12272 - loss: 0.5972 - acc_top1: 0.5985 - acc_top2: 0.8506 - 1s/step
step 1860/12272 - loss: 0.6309 - acc_top1: 0.5992 - acc_top2: 0.8510 - 1s/step
step 1870/12272 - loss: 0.8707 - acc_top1: 0.5995 - acc_top2: 0.8512 - 1s/step
step 1880/12272 - loss: 0.6419 - acc_top1: 0.6004 - acc_top2: 0.8516 - 1s/step
step 1890/12272 - loss: 0.6015 - acc_top1: 0.6010 - acc_top2: 0.8521 - 1s/step
step 1900/12272 - loss: 0.6000 - acc_top1: 0.6015 - acc_top2: 0.8524 - 1s/step
step 1910/12272 - loss: 0.7010 - acc_top1: 0.6020 - acc_top2: 0.8527 - 1s/step
step 1920/12272 - loss: 0.8539 - acc_top1: 0.6026 - acc_top2: 0.8530 - 1s/step
step 1930/12272 - loss: 0.8381 - acc_top1: 0.6031 - acc_top2: 0.8533 - 1s/step
step 1940/12272 - loss: 0.5921 - acc_top1: 0.6039 - acc_top2: 0.8537 - 1s/step
step 1950/12272 - loss: 0.4974 - acc_top1: 0.6047 - acc_top2: 0.8541 - 1s/step
step 1960/12272 - loss: 0.8269 - acc_top1: 0.6052 - acc_top2: 0.8544 - 1s/step
step 1970/12272 - loss: 0.6157 - acc_top1: 0.6058 - acc_top2: 0.8548 - 1s/step
step 1980/12272 - loss: 1.0949 - acc_top1: 0.6064 - acc_top2: 0.8552 - 1s/step
step 1990/12272 - loss: 0.6442 - acc_top1: 0.6070 - acc_top2: 0.8555 - 1s/step
step 2000/12272 - loss: 0.8747 - acc_top1: 0.6073 - acc_top2: 0.8558 - 1s/step
step 2010/12272 - loss: 0.8101 - acc_top1: 0.6078 - acc_top2: 0.8560 - 1s/step
step 2020/12272 - loss: 0.8623 - acc_top1: 0.6082 - acc_top2: 0.8562 - 1s/step
step 2030/12272 - loss: 0.6664 - acc_top1: 0.6089 - acc_top2: 0.8567 - 1s/step
step 2040/12272 - loss: 0.7616 - acc_top1: 0.6092 - acc_top2: 0.8567 - 1s/step
step 2050/12272 - loss: 0.7282 - acc_top1: 0.6095 - acc_top2: 0.8570 - 1s/step
step 2060/12272 - loss: 0.6914 - acc_top1: 0.6099 - acc_top2: 0.8574 - 1s/step
step 2070/12272 - loss: 0.6129 - acc_top1: 0.6105 - acc_top2: 0.8577 - 1s/step
step 2080/12272 - loss: 0.5605 - acc_top1: 0.6111 - acc_top2: 0.8580 - 1s/step
step 2090/12272 - loss: 0.6432 - acc_top1: 0.6116 - acc_top2: 0.8582 - 1s/step
step 2100/12272 - loss: 0.6783 - acc_top1: 0.6121 - acc_top2: 0.8586 - 1s/step
step 2110/12272 - loss: 0.5949 - acc_top1: 0.6128 - acc_top2: 0.8589 - 1s/step
step 2120/12272 - loss: 0.7832 - acc_top1: 0.6134 - acc_top2: 0.8592 - 1s/step
step 2130/12272 - loss: 0.6633 - acc_top1: 0.6139 - acc_top2: 0.8594 - 1s/step
step 2140/12272 - loss: 0.8456 - acc_top1: 0.6143 - acc_top2: 0.8596 - 1s/step
step 2150/12272 - loss: 0.7133 - acc_top1: 0.6150 - acc_top2: 0.8599 - 1s/step
step 2160/12272 - loss: 0.4699 - acc_top1: 0.6155 - acc_top2: 0.8602 - 1s/step
step 2170/12272 - loss: 0.6013 - acc_top1: 0.6161 - acc_top2: 0.8605 - 1s/step
step 2180/12272 - loss: 0.5676 - acc_top1: 0.6165 - acc_top2: 0.8608 - 1s/step
step 2190/12272 - loss: 0.5850 - acc_top1: 0.6172 - acc_top2: 0.8611 - 1s/step
step 2200/12272 - loss: 0.6887 - acc_top1: 0.6177 - acc_top2: 0.8612 - 1s/step
step 2210/12272 - loss: 0.5706 - acc_top1: 0.6180 - acc_top2: 0.8614 - 1s/step
step 2220/12272 - loss: 0.8251 - acc_top1: 0.6184 - acc_top2: 0.8617 - 1s/step
step 2230/12272 - loss: 0.6532 - acc_top1: 0.6188 - acc_top2: 0.8620 - 1s/step
step 2240/12272 - loss: 0.5888 - acc_top1: 0.6194 - acc_top2: 0.8623 - 1s/step
step 2250/12272 - loss: 0.6360 - acc_top1: 0.6198 - acc_top2: 0.8625 - 1s/step
step 2260/12272 - loss: 1.0555 - acc_top1: 0.6202 - acc_top2: 0.8628 - 1s/step
step 2270/12272 - loss: 0.4848 - acc_top1: 0.6207 - acc_top2: 0.8629 - 1s/step
step 2280/12272 - loss: 0.7243 - acc_top1: 0.6212 - acc_top2: 0.8632 - 1s/step
step 2290/12272 - loss: 0.4358 - acc_top1: 0.6216 - acc_top2: 0.8635 - 1s/step
step 2300/12272 - loss: 0.5473 - acc_top1: 0.6221 - acc_top2: 0.8637 - 1s/step
step 2310/12272 - loss: 0.6440 - acc_top1: 0.6226 - acc_top2: 0.8640 - 1s/step
step 2320/12272 - loss: 0.5785 - acc_top1: 0.6233 - acc_top2: 0.8643 - 1s/step
step 2330/12272 - loss: 0.7199 - acc_top1: 0.6237 - acc_top2: 0.8646 - 1s/step
step 2340/12272 - loss: 0.5622 - acc_top1: 0.6241 - acc_top2: 0.8647 - 1s/step
step 2350/12272 - loss: 0.6742 - acc_top1: 0.6245 - acc_top2: 0.8650 - 1s/step
step 2360/12272 - loss: 0.8149 - acc_top1: 0.6249 - acc_top2: 0.8652 - 1s/step
step 2370/12272 - loss: 0.5900 - acc_top1: 0.6253 - acc_top2: 0.8654 - 1s/step
step 2380/12272 - loss: 0.8046 - acc_top1: 0.6256 - acc_top2: 0.8656 - 1s/step
step 2390/12272 - loss: 0.6097 - acc_top1: 0.6262 - acc_top2: 0.8659 - 1s/step
step 2400/12272 - loss: 0.5936 - acc_top1: 0.6266 - acc_top2: 0.8660 - 1s/step
step 2410/12272 - loss: 0.7245 - acc_top1: 0.6270 - acc_top2: 0.8662 - 1s/step
step 2420/12272 - loss: 0.6349 - acc_top1: 0.6274 - acc_top2: 0.8665 - 1s/step
step 2430/12272 - loss: 0.7009 - acc_top1: 0.6278 - acc_top2: 0.8668 - 1s/step
step 2440/12272 - loss: 0.3881 - acc_top1: 0.6282 - acc_top2: 0.8670 - 1s/step
step 2450/12272 - loss: 0.5226 - acc_top1: 0.6286 - acc_top2: 0.8673 - 1s/step
step 2460/12272 - loss: 0.5748 - acc_top1: 0.6292 - acc_top2: 0.8675 - 1s/step
step 2470/12272 - loss: 0.4798 - acc_top1: 0.6297 - acc_top2: 0.8678 - 1s/step
step 2480/12272 - loss: 0.5857 - acc_top1: 0.6303 - acc_top2: 0.8680 - 1s/step
step 2490/12272 - loss: 0.6729 - acc_top1: 0.6308 - acc_top2: 0.8683 - 1s/step
step 2500/12272 - loss: 0.6392 - acc_top1: 0.6312 - acc_top2: 0.8686 - 1s/step
step 2510/12272 - loss: 0.9607 - acc_top1: 0.6315 - acc_top2: 0.8687 - 1s/step
step 2520/12272 - loss: 0.6036 - acc_top1: 0.6319 - acc_top2: 0.8690 - 1s/step
step 2530/12272 - loss: 0.6505 - acc_top1: 0.6324 - acc_top2: 0.8693 - 1s/step
step 2540/12272 - loss: 0.4558 - acc_top1: 0.6329 - acc_top2: 0.8696 - 1s/step
step 2550/12272 - loss: 0.4215 - acc_top1: 0.6333 - acc_top2: 0.8699 - 1s/step
step 2560/12272 - loss: 0.6908 - acc_top1: 0.6338 - acc_top2: 0.8701 - 1s/step
step 2570/12272 - loss: 0.5833 - acc_top1: 0.6342 - acc_top2: 0.8703 - 1s/step
step 2580/12272 - loss: 0.8548 - acc_top1: 0.6346 - acc_top2: 0.8706 - 1s/step
step 2590/12272 - loss: 0.5770 - acc_top1: 0.6351 - acc_top2: 0.8708 - 1s/step
step 2600/12272 - loss: 0.4476 - acc_top1: 0.6355 - acc_top2: 0.8711 - 1s/step
step 2610/12272 - loss: 0.4145 - acc_top1: 0.6360 - acc_top2: 0.8714 - 1s/step
step 2620/12272 - loss: 0.6625 - acc_top1: 0.6365 - acc_top2: 0.8717 - 1s/step
step 2630/12272 - loss: 0.4808 - acc_top1: 0.6369 - acc_top2: 0.8719 - 1s/step
#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
# start fine-tuning
python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py\
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
...@@ -4,7 +4,7 @@ TASK_NAME='MNLI' ...@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/" DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models" CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=7 export CUDA_VISIBLE_DEVICES=0
# start fine-tuning # start fine-tuning
python3.7 bert_classifier.py\ python3.7 bert_classifier.py\
......
...@@ -47,10 +47,15 @@ class BmnMetric(Metric): ...@@ -47,10 +47,15 @@ class BmnMetric(Metric):
if not os.path.isdir(self.cfg.INFER.result_path): if not os.path.isdir(self.cfg.INFER.result_path):
os.makedirs(self.cfg.INFER.result_path) os.makedirs(self.cfg.INFER.result_path)
def add_metric_op(self, preds, label): def add_metric_op(self, *args):
pred_bm, pred_start, pred_en = preds if self.mode == 'test':
video_index = label[-1] # only extract pred_bm, pred_start, pred_en from outputs
return [pred_bm, pred_start, pred_en, video_index] #return list # and video_index from label here
pred_bm, pred_start, pred_en, _, _, _, video_index = args
else:
# in infer mode, labels only contains video_index
pred_bm, pred_start, pred_en, video_index = args
return pred_bm, pred_start, pred_en, video_index
def update(self, pred_bm, pred_start, pred_end, fid): def update(self, pred_bm, pred_start, pred_end, fid):
# generate proposals # generate proposals
......
...@@ -19,7 +19,8 @@ import logging ...@@ -19,7 +19,8 @@ import logging
import paddle.fluid as fluid import paddle.fluid as fluid
from hapi.model import set_device, Input from hapi.model import set_device, Input
from hapi.vision.models import bmn, BmnLoss
from modeling import bmn, BmnLoss
from bmn_metric import BmnMetric from bmn_metric import BmnMetric
from reader import BmnDataset from reader import BmnDataset
from config_utils import * from config_utils import *
......
...@@ -26,7 +26,7 @@ DATATYPE = 'float32' ...@@ -26,7 +26,7 @@ DATATYPE = 'float32'
pretrain_infos = { pretrain_infos = {
'bmn': ('https://paddlemodels.bj.bcebos.com/hapi/bmn.pdparams', 'bmn': ('https://paddlemodels.bj.bcebos.com/hapi/bmn.pdparams',
'9286c821acc4cad46d6613b931ba468c') 'aa84e3386e1fbd117fb96fa572feeb94')
} }
...@@ -462,5 +462,5 @@ def bmn(tscale, ...@@ -462,5 +462,5 @@ def bmn(tscale,
weight_path = get_weights_path(*(pretrain_infos['bmn'])) weight_path = get_weights_path(*(pretrain_infos['bmn']))
assert weight_path.endswith('.pdparams'), \ assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams" "suffix of weight must be .pdparams"
model.load(weight_path[:-9]) model.load(weight_path)
return model return model
...@@ -19,7 +19,8 @@ import logging ...@@ -19,7 +19,8 @@ import logging
import paddle.fluid as fluid import paddle.fluid as fluid
from hapi.model import set_device, Input from hapi.model import set_device, Input
from hapi.vision.models import bmn, BmnLoss
from modeling import bmn, BmnLoss
from bmn_metric import BmnMetric from bmn_metric import BmnMetric
from reader import BmnDataset from reader import BmnDataset
from config_utils import * from config_utils import *
......
...@@ -19,9 +19,10 @@ import sys ...@@ -19,9 +19,10 @@ import sys
import os import os
from hapi.model import set_device, Input from hapi.model import set_device, Input
from hapi.vision.models import bmn, BmnLoss
from reader import BmnDataset from reader import BmnDataset
from config_utils import * from config_utils import *
from modeling import bmn, BmnLoss
DATATYPE = 'float32' DATATYPE = 'float32'
......
简介
--------
本OCR任务是识别图片单行的字母信息,基于attention的seq2seq结构。 运行本目录下的程序示例需要使用PaddlePaddle develop最新版本。
## 代码结构
```
.
|-- data.py # 数据读取
|-- eval.py # 评估脚本
|-- images # 测试图片
|-- predict.py # 预测脚本
|-- seq2seq_attn.py # 模型
|-- train.py # 训练脚本
`-- utility.py # 公共模块
```
## 训练/评估/预测流程
- 设置GPU环境:
```
export CUDA_VISIBLE_DEVICES=0
```
- 训练
```
python train.py
```
更多参数可以通过`--help`查看。
- 动静切换
```
python train.py --dynamic=True
```
- 评估
```
python eval.py --init_model=checkpoint/final
```
- 预测
目前不支持动态图预测
```
python predict.py --init_model=checkpoint/final --image_path=images/ --dynamic=False --beam_size=3
```
预测结果如下:
```
Image 1: images/112_chubbiness_13557.jpg
0: chubbines
1: chubbiness
2: chubbinesS
Image 2: images/177_Interfiled_40185.jpg
0: Interflied
1: Interfiled
2: InterfIled
Image 3: images/325_dame_19109.jpg
0: da
1: damo
2: dame
Image 4: images/368_fixtures_29232.jpg
0: firtures
1: Firtures
2: fixtures
```
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from os import path
import random
import traceback
import copy
import math
import tarfile
from PIL import Image
import logging
logger = logging.getLogger(__name__)
import paddle
from paddle import fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
CACHE_DIR_NAME = "attention_data"
SAVED_FILE_NAME = "data.tar.gz"
DATA_DIR_NAME = "data"
TRAIN_DATA_DIR_NAME = "train_images"
TEST_DATA_DIR_NAME = "test_images"
TRAIN_LIST_FILE_NAME = "train.list"
TEST_LIST_FILE_NAME = "test.list"
class Resize(object):
def __init__(self, height=48):
self.interp = Image.NEAREST # Image.ANTIALIAS
self.height = height
def __call__(self, samples):
shape = samples[0][0].size
for i in range(len(samples)):
im = samples[i][0]
im = im.resize((shape[0], self.height), self.interp)
samples[i][0] = im
return samples
class Normalize(object):
def __init__(self,
mean=[127.5],
std=[1.0],
scale=False,
channel_first=True):
self.mean = mean
self.std = std
self.scale = scale
self.channel_first = channel_first
if not (isinstance(self.mean, list) and isinstance(self.std, list) and
isinstance(self.scale, bool)):
raise TypeError("{}: input type is invalid.".format(self))
def __call__(self, samples):
for i in range(len(samples)):
im = samples[i][0]
im = np.array(im).astype(np.float32, copy=False)
im = im[np.newaxis, ...]
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
if self.scale:
im = im / 255.0
#im -= mean
im -= 127.5
#im /= std
samples[i][0] = im
return samples
class PadTarget(object):
def __init__(self, SOS=0, EOS=1):
self.SOS = SOS
self.EOS = EOS
def __call__(self, samples):
lens = np.array([len(s[1]) for s in samples], dtype="int64")
max_len = np.max(lens)
for i in range(len(samples)):
label = samples[i][1]
if max_len > len(label):
pad_label = label + [self.EOS] * (max_len - len(label))
else:
pad_label = label
samples[i][1] = np.array([self.SOS] + pad_label, dtype='int64')
# label_out
samples[i].append(np.array(pad_label + [self.EOS], dtype='int64'))
mask = np.zeros((max_len + 1)).astype('float32')
mask[:len(label) + 1] = 1.0
# mask
samples[i].append(np.array(mask, dtype='float32'))
return samples
class BatchSampler(fluid.io.BatchSampler):
def __init__(self,
dataset,
batch_size,
shuffle=False,
drop_last=True,
seed=None):
self._dataset = dataset
self._batch_size = batch_size
self._shuffle = shuffle
self._drop_last = drop_last
self._random = np.random
self._random.seed(seed)
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
self._device_id = ParallelEnv().dev_id
self._num_samples = int(
math.ceil(len(self._dataset) * 1.0 / self._nranks))
self._total_size = self._num_samples * self._nranks
self._epoch = 0
def __iter__(self):
infos = copy.copy(self._dataset._sample_infos)
skip_num = 0
if self._shuffle:
if self._batch_size == 1:
self._random.RandomState(self._epoch).shuffle(infos)
else: # partial shuffle
infos = sorted(infos, key=lambda x: x.w)
skip_num = random.randint(1, 100)
infos = infos[skip_num:] + infos[:skip_num]
infos += infos[:(self._total_size - len(infos))]
last_size = self._total_size % (self._batch_size * self._nranks)
batches = []
for i in range(self._local_rank * self._batch_size,
len(infos) - last_size,
self._batch_size * self._nranks):
batches.append(infos[i:i + self._batch_size])
if (not self._drop_last) and last_size != 0:
last_local_size = last_size // self._nranks
last_infos = infos[len(infos) - last_size:]
start = self._local_rank * last_local_size
batches.append(last_infos[start:start + last_local_size])
if self._shuffle:
self._random.RandomState(self._epoch).shuffle(batches)
self._epoch += 1
for batch in batches:
batch_indices = [info.idx for info in batch]
yield batch_indices
def __len__(self):
if self._drop_last:
return self._total_size // self._batch_size
else:
return math.ceil(self._total_size / float(self._batch_size))
class SampleInfo(object):
def __init__(self, idx, h, w, im_name, labels):
self.idx = idx
self.h = h
self.w = w
self.im_name = im_name
self.labels = labels
class OCRDataset(paddle.io.Dataset):
def __init__(self, image_dir, anno_file):
self.image_dir = image_dir
self.anno_file = anno_file
self._sample_infos = []
with open(anno_file, 'r') as f:
for i, line in enumerate(f):
w, h, im_name, labels = line.strip().split(' ')
h, w = int(h), int(w)
labels = [int(c) for c in labels.split(',')]
self._sample_infos.append(SampleInfo(i, h, w, im_name, labels))
def __getitem__(self, idx):
info = self._sample_infos[idx]
im_name, labels = info.im_name, info.labels
image = Image.open(path.join(self.image_dir, im_name)).convert('L')
return [image, labels]
def __len__(self):
return len(self._sample_infos)
def train(
root_dir=None,
images_dir=None,
anno_file=None,
shuffle=True, ):
if root_dir is None:
root_dir = download_data()
if images_dir is None:
images_dir = TRAIN_DATA_DIR_NAME
images_dir = path.join(root_dir, TRAIN_DATA_DIR_NAME)
if anno_file is None:
anno_file = TRAIN_LIST_FILE_NAME
anno_file = path.join(root_dir, TRAIN_LIST_FILE_NAME)
return OCRDataset(images_dir, anno_file)
def test(
root_dir=None,
images_dir=None,
anno_file=None,
shuffle=True, ):
if root_dir is None:
root_dir = download_data()
if images_dir is None:
images_dir = TEST_DATA_DIR_NAME
images_dir = path.join(root_dir, TEST_DATA_DIR_NAME)
if anno_file is None:
anno_file = TEST_LIST_FILE_NAME
anno_file = path.join(root_dir, TEST_LIST_FILE_NAME)
return OCRDataset(images_dir, anno_file)
def download_data():
'''Download train and test data.
'''
tar_file = paddle.dataset.common.download(
DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
if not path.isdir(data_dir):
t = tarfile.open(tar_file, "r:gz")
t.extractall(path=path.dirname(tar_file))
t.close()
return data_dir
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import functools
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import SeqAccuracy, LoggerCallBack, SeqBeamAccuracy
from utility import postprocess
from seq2seq_attn import Seq2SeqAttModel, Seq2SeqAttInferModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('test_images', str, None, "The directory of images to be used for test.")
add_arg('test_list', str, None, "The list file of images to be used for training.")
add_arg('init_model', str, 'checkpoint/final', "The init model file of directory.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('beam_size', int, 0, "If set beam size, will use beam search.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes)
# yapf: disable
inputs = [
Input([None, 1, 48, 384], "float32", name="pixel"),
Input([None, None], "int64", name="label_in")
]
labels = [
Input([None, None], "int64", name="label_out"),
Input([None, None], "float32", name="mask")
]
# yapf: enable
model.prepare(
loss_function=WeightCrossEntropy(),
metrics=SeqAccuracy(),
inputs=inputs,
labels=labels,
device=device)
model.load(FLAGS.init_model)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.evaluate(
eval_data=test_loader,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
def beam_search(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttInferModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes,
beam_size=FLAGS.beam_size)
inputs = [
Input(
[None, 1, 48, 384], "float32", name="pixel"), Input(
[None, None], "int64", name="label_in")
]
labels = [
Input(
[None, None], "int64", name="label_out"), Input(
[None, None], "float32", name="mask")
]
model.prepare(
loss_function=None,
metrics=SeqBeamAccuracy(),
inputs=inputs,
labels=labels,
device=device)
model.load(FLAGS.init_model)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.evaluate(
eval_data=test_loader,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
if FLAGS.beam_size:
beam_search(FLAGS)
else:
main(FLAGS)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import random
import numpy as np
import argparse
import functools
from PIL import Image
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.datasets.folder import ImageFolder
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import postprocess, index2word
from seq2seq_attn import Seq2SeqAttInferModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('image_path', str, None, "The directory of images to be used for test.")
add_arg('init_model', str, None, "The init model file of directory.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
# model hyper paramters
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('beam_size', int, 3, "Beam size for beam search.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttInferModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes,
beam_size=FLAGS.beam_size)
inputs = [Input([None, 1, 48, 384], "float32", name="pixel"), ]
model.prepare(inputs=inputs, device=device)
model.load(FLAGS.init_model)
fn = lambda p: Image.open(p).convert('L')
test_dataset = ImageFolder(FLAGS.image_path, loader=fn)
test_collate_fn = BatchCompose([data.Resize(), data.Normalize()])
test_loader = fluid.io.DataLoader(
test_dataset,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
samples = test_dataset.samples
#outputs = model.predict(test_loader)
ins_id = 0
for image, in test_loader:
image = image if FLAGS.dynamic else image[0]
pred = model.test_batch([image])[0]
pred = pred[:, :, np.newaxis] if len(pred.shape) == 2 else pred
pred = np.transpose(pred, [0, 2, 1])
for ins in pred:
impath = samples[ins_id]
ins_id += 1
print('Image {}: {}'.format(ins_id, impath))
for beam_idx, beam in enumerate(ins):
id_list = postprocess(beam)
word_list = index2word(id_list)
sequence = "".join(word_list)
print('{}: {}'.format(beam_idx, sequence))
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
main(FLAGS)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers import BeamSearchDecoder
from hapi.text import RNNCell, RNN, DynamicDecode
from hapi.model import Model, Loss
class ConvBNPool(fluid.dygraph.Layer):
def __init__(self,
in_ch,
out_ch,
act="relu",
is_test=False,
pool=True,
use_cudnn=True):
super(ConvBNPool, self).__init__()
self.pool = pool
filter_size = 3
std = (2.0 / (filter_size**2 * in_ch))**0.5
param_0 = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, std))
std = (2.0 / (filter_size**2 * out_ch))**0.5
param_1 = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, std))
self.conv0 = fluid.dygraph.Conv2D(
in_ch,
out_ch,
3,
padding=1,
param_attr=param_0,
bias_attr=False,
act=None,
use_cudnn=use_cudnn)
self.bn0 = fluid.dygraph.BatchNorm(out_ch, act=act)
self.conv1 = fluid.dygraph.Conv2D(
out_ch,
out_ch,
filter_size=3,
padding=1,
param_attr=param_1,
bias_attr=False,
act=None,
use_cudnn=use_cudnn)
self.bn1 = fluid.dygraph.BatchNorm(out_ch, act=act)
if self.pool:
self.pool = fluid.dygraph.Pool2D(
pool_size=2,
pool_type='max',
pool_stride=2,
use_cudnn=use_cudnn,
ceil_mode=True)
def forward(self, inputs):
out = self.conv0(inputs)
out = self.bn0(out)
out = self.conv1(out)
out = self.bn1(out)
if self.pool:
out = self.pool(out)
return out
class CNN(fluid.dygraph.Layer):
def __init__(self, in_ch=1, is_test=False):
super(CNN, self).__init__()
self.conv_bn1 = ConvBNPool(in_ch, 16)
self.conv_bn2 = ConvBNPool(16, 32)
self.conv_bn3 = ConvBNPool(32, 64)
self.conv_bn4 = ConvBNPool(64, 128, pool=False)
def forward(self, inputs):
conv = self.conv_bn1(inputs)
conv = self.conv_bn2(conv)
conv = self.conv_bn3(conv)
conv = self.conv_bn4(conv)
return conv
class GRUCell(RNNCell):
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False):
super(GRUCell, self).__init__()
self.hidden_size = hidden_size
self.fc_layer = fluid.dygraph.Linear(
input_size,
hidden_size * 3,
param_attr=param_attr,
bias_attr=False)
self.gru_unit = fluid.dygraph.GRUUnit(
hidden_size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
def forward(self, inputs, states):
# step_outputs, new_states = cell(step_inputs, states)
# for GRUCell, `step_outputs` and `new_states` both are hidden
x = self.fc_layer(inputs)
hidden, _, _ = self.gru_unit(x, states)
return hidden, hidden
@property
def state_shape(self):
return [self.hidden_size]
class Encoder(fluid.dygraph.Layer):
def __init__(
self,
in_channel=1,
rnn_hidden_size=200,
decoder_size=128,
is_test=False, ):
super(Encoder, self).__init__()
self.rnn_hidden_size = rnn_hidden_size
self.backbone = CNN(in_ch=in_channel, is_test=is_test)
para_attr = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, 0.02))
bias_attr = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
self.gru_fwd = RNN(cell=GRUCell(
input_size=128 * 6,
hidden_size=rnn_hidden_size,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu'),
is_reverse=False,
time_major=False)
self.gru_bwd = RNN(cell=GRUCell(
input_size=128 * 6,
hidden_size=rnn_hidden_size,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu'),
is_reverse=True,
time_major=False)
self.encoded_proj_fc = fluid.dygraph.Linear(
rnn_hidden_size * 2, decoder_size, bias_attr=False)
def forward(self, inputs):
conv_features = self.backbone(inputs)
conv_features = fluid.layers.transpose(
conv_features, perm=[0, 3, 1, 2])
n, w, c, h = conv_features.shape
seq_feature = fluid.layers.reshape(conv_features, [0, -1, c * h])
gru_fwd, _ = self.gru_fwd(seq_feature)
gru_bwd, _ = self.gru_bwd(seq_feature)
encoded_vector = fluid.layers.concat(input=[gru_fwd, gru_bwd], axis=2)
encoded_proj = self.encoded_proj_fc(encoded_vector)
return gru_bwd, encoded_vector, encoded_proj
class Attention(fluid.dygraph.Layer):
"""
Neural Machine Translation by Jointly Learning to Align and Translate.
https://arxiv.org/abs/1409.0473
"""
def __init__(self, decoder_size):
super(Attention, self).__init__()
self.fc1 = fluid.dygraph.Linear(
decoder_size, decoder_size, bias_attr=False)
self.fc2 = fluid.dygraph.Linear(decoder_size, 1, bias_attr=False)
def forward(self, encoder_vec, encoder_proj, decoder_state):
# alignment model, single-layer multilayer perceptron
decoder_state = self.fc1(decoder_state)
decoder_state = fluid.layers.unsqueeze(decoder_state, [1])
e = fluid.layers.elementwise_add(encoder_proj, decoder_state)
e = fluid.layers.tanh(e)
att_scores = self.fc2(e)
att_scores = fluid.layers.squeeze(att_scores, [2])
att_scores = fluid.layers.softmax(att_scores)
context = fluid.layers.elementwise_mul(
x=encoder_vec, y=att_scores, axis=0)
context = fluid.layers.reduce_sum(context, dim=1)
return context
class DecoderCell(RNNCell):
def __init__(self, encoder_size=200, decoder_size=128):
super(DecoderCell, self).__init__()
self.attention = Attention(decoder_size)
self.gru_cell = GRUCell(
input_size=encoder_size * 2 + decoder_size,
hidden_size=decoder_size)
def forward(self, current_word, states, encoder_vec, encoder_proj):
context = self.attention(encoder_vec, encoder_proj, states)
decoder_inputs = fluid.layers.concat([current_word, context], axis=1)
hidden, _ = self.gru_cell(decoder_inputs, states)
return hidden, hidden
class Decoder(fluid.dygraph.Layer):
def __init__(self, num_classes, emb_dim, encoder_size, decoder_size):
super(Decoder, self).__init__()
self.decoder_attention = RNN(DecoderCell(encoder_size, decoder_size))
self.fc = fluid.dygraph.Linear(
decoder_size, num_classes + 2, act='softmax')
def forward(self, target, initial_states, encoder_vec, encoder_proj):
out, _ = self.decoder_attention(
target,
initial_states=initial_states,
encoder_vec=encoder_vec,
encoder_proj=encoder_proj)
pred = self.fc(out)
return pred
class Seq2SeqAttModel(Model):
def __init__(
self,
in_channle=1,
encoder_size=200,
decoder_size=128,
emb_dim=128,
num_classes=None, ):
super(Seq2SeqAttModel, self).__init__()
self.encoder = Encoder(in_channle, encoder_size, decoder_size)
self.fc = fluid.dygraph.Linear(
input_dim=encoder_size,
output_dim=decoder_size,
bias_attr=False,
act='relu')
self.embedding = fluid.dygraph.Embedding(
[num_classes + 2, emb_dim], dtype='float32')
self.decoder = Decoder(num_classes, emb_dim, encoder_size,
decoder_size)
def forward(self, inputs, target):
gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
decoder_boot = self.fc(gru_backward[:, 0])
trg_embedding = self.embedding(target)
prediction = self.decoder(trg_embedding, decoder_boot, encoded_vector,
encoded_proj)
return prediction
class Seq2SeqAttInferModel(Seq2SeqAttModel):
def __init__(
self,
in_channle=1,
encoder_size=200,
decoder_size=128,
emb_dim=128,
num_classes=None,
beam_size=0,
bos_id=0,
eos_id=1,
max_out_len=20, ):
super(Seq2SeqAttInferModel, self).__init__(
in_channle, encoder_size, decoder_size, emb_dim, num_classes)
self.beam_size = beam_size
# dynamic decoder for inference
decoder = BeamSearchDecoder(
self.decoder.decoder_attention.cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=self.embedding,
output_fn=self.decoder.fc)
self.infer_decoder = DynamicDecode(
decoder, max_step_num=max_out_len, is_test=True)
def forward(self, inputs, *args):
gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
decoder_boot = self.fc(gru_backward[:, 0])
if self.beam_size:
# Tile the batch dimension with beam_size
encoded_vector = BeamSearchDecoder.tile_beam_merge_with_batch(
encoded_vector, self.beam_size)
encoded_proj = BeamSearchDecoder.tile_beam_merge_with_batch(
encoded_proj, self.beam_size)
# dynamic decoding with beam search
rs, _ = self.infer_decoder(
inits=decoder_boot,
encoder_vec=encoded_vector,
encoder_proj=encoded_proj)
return rs
class WeightCrossEntropy(Loss):
def __init__(self):
super(WeightCrossEntropy, self).__init__(average=False)
def forward(self, outputs, labels):
predict, (label, mask) = outputs[0], labels
loss = layers.cross_entropy(predict, label=label)
loss = layers.elementwise_mul(loss, mask, axis=0)
loss = layers.reduce_sum(loss)
return loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import random
import numpy as np
import argparse
import functools
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import SeqAccuracy, LoggerCallBack
from seq2seq_attn import Seq2SeqAttModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('epoch', int, 30, "Epoch number.")
add_arg('num_workers', int, 0, "workers number.")
add_arg('lr', float, 0.001, "Learning rate.")
add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.")
add_arg('checkpoint_path', str, "checkpoint", "The directory the model to be saved to.")
add_arg('train_images', str, None, "The directory of images to be used for training.")
add_arg('train_list', str, None, "The list file of images to be used for training.")
add_arg('test_images', str, None, "The directory of images to be used for test.")
add_arg('test_list', str, None, "The list file of images to be used for training.")
add_arg('resume_path', str, None, "The init model file of directory.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
# model hyper paramters
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('gradient_clip', float, 5.0, "Gradient clip value.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes)
lr = FLAGS.lr
if FLAGS.lr_decay_strategy == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(
[200000, 250000], [lr, lr * 0.1, lr * 0.01])
else:
learning_rate = lr
grad_clip = fluid.clip.GradientClipByGlobalNorm(FLAGS.gradient_clip)
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
parameter_list=model.parameters(),
grad_clip=grad_clip)
# yapf: disable
inputs = [
Input([None,1,48,384], "float32", name="pixel"),
Input([None, None], "int64", name="label_in"),
]
labels = [
Input([None, None], "int64", name="label_out"),
Input([None, None], "float32", name="mask"),
]
# yapf: enable
model.prepare(
optimizer,
WeightCrossEntropy(),
SeqAccuracy(),
inputs=inputs,
labels=labels)
train_dataset = data.train()
train_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
train_sampler = data.BatchSampler(
train_dataset, batch_size=FLAGS.batch_size, shuffle=True)
train_loader = fluid.io.DataLoader(
train_dataset,
batch_sampler=train_sampler,
places=device,
num_workers=FLAGS.num_workers,
return_list=True,
collate_fn=train_collate_fn)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.fit(train_data=train_loader,
eval_data=test_loader,
epochs=FLAGS.epoch,
save_dir=FLAGS.checkpoint_path,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
main(FLAGS)
"""Contains common utility functions."""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
import paddle.fluid as fluid
import six
from hapi.metrics import Metric
from hapi.callbacks import ProgBarLogger
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class SeqAccuracy(Metric):
def __init__(self, name=None, *args, **kwargs):
super(SeqAccuracy, self).__init__(*args, **kwargs)
self._name = 'seq_acc'
self.reset()
def add_metric_op(self, output, label, mask, *args, **kwargs):
pred = fluid.layers.flatten(output, axis=2)
score, topk = fluid.layers.topk(pred, 1)
return topk, label, mask
def update(self, topk, label, mask, *args, **kwargs):
topk = topk.reshape(label.shape[0], -1)
seq_len = np.sum(mask, -1)
acc = 0
for i in range(label.shape[0]):
l = int(seq_len[i] - 1)
pred = topk[i][:l - 1]
ref = label[i][:l - 1]
if np.array_equal(pred, ref):
self.total += 1
acc += 1
self.count += 1
return float(acc) / label.shape[0]
def reset(self):
self.total = 0.
self.count = 0.
def accumulate(self):
return float(self.total) / self.count
def name(self):
return self._name
class LoggerCallBack(ProgBarLogger):
def __init__(self, log_freq=1, verbose=2, train_bs=None, eval_bs=None):
super(LoggerCallBack, self).__init__(log_freq, verbose)
self.train_bs = train_bs
self.eval_bs = eval_bs if eval_bs else train_bs
def on_train_batch_end(self, step, logs=None):
logs = logs or {}
logs['loss'] = [l / self.train_bs for l in logs['loss']]
super(LoggerCallBack, self).on_train_batch_end(step, logs)
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
logs['loss'] = [l / self.train_bs for l in logs['loss']]
super(LoggerCallBack, self).on_epoch_end(epoch, logs)
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
logs['loss'] = [l / self.eval_bs for l in logs['loss']]
super(LoggerCallBack, self).on_eval_batch_end(step, logs)
def on_eval_end(self, logs=None):
logs = logs or {}
logs['loss'] = [l / self.eval_bs for l in logs['loss']]
super(LoggerCallBack, self).on_eval_end(logs)
def index2word(ids):
return [chr(int(k + 33)) for k in ids]
def postprocess(seq, bos_idx=0, eos_idx=1):
if type(seq) is np.ndarray:
seq = seq.tolist()
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = [
idx for idx in seq[:eos_pos + 1] if idx != bos_idx and idx != eos_idx
]
return seq
class SeqBeamAccuracy(Metric):
def __init__(self, name=None, *args, **kwargs):
super(SeqBeamAccuracy, self).__init__(*args, **kwargs)
self._name = 'seq_acc'
self.reset()
def add_metric_op(self, output, label, mask, *args, **kwargs):
return output, label, mask
def update(self, preds, labels, masks, *args, **kwargs):
preds = preds[:, :, np.newaxis] if len(preds.shape) == 2 else preds
preds = np.transpose(preds, [0, 2, 1])
seq_len = np.sum(masks, -1)
acc = 0
for i in range(labels.shape[0]):
l = int(seq_len[i] - 1)
#ref = labels[i][: l - 1]
ref = np.array(postprocess(labels[i]))
pred = preds[i]
for idx, beam in enumerate(pred):
beam_pred = np.array(postprocess(beam))
if np.array_equal(beam_pred, ref):
self.total += 1
acc += 1
break
self.count += 1
return float(acc) / labels.shape[0]
def reset(self):
self.total = 0.
self.count = 0.
def accumulate(self):
return float(self.total) / self.count
def name(self):
return self._name
...@@ -20,9 +20,9 @@ import argparse ...@@ -20,9 +20,9 @@ import argparse
import numpy as np import numpy as np
from hapi.model import Input, set_device from hapi.model import Input, set_device
from hapi.vision.models import tsm_resnet50
from check import check_gpu, check_version from check import check_gpu, check_version
from modeling import tsm_resnet50
from kinetics_dataset import KineticsDataset from kinetics_dataset import KineticsDataset
from transforms import * from transforms import *
......
...@@ -24,8 +24,8 @@ from paddle.fluid.dygraph.parallel import ParallelEnv ...@@ -24,8 +24,8 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
from hapi.model import Model, CrossEntropy, Input, set_device from hapi.model import Model, CrossEntropy, Input, set_device
from hapi.metrics import Accuracy from hapi.metrics import Accuracy
from hapi.vision.models import tsm_resnet50
from modeling import tsm_resnet50
from check import check_gpu, check_version from check import check_gpu, check_version
from kinetics_dataset import KineticsDataset from kinetics_dataset import KineticsDataset
from transforms import * from transforms import *
......
...@@ -196,7 +196,7 @@ def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True): ...@@ -196,7 +196,7 @@ def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True):
weight_path = get_weights_path(*(pretrain_infos[num_layers])) weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \ assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams" "suffix of weight must be .pdparams"
model.load(weight_path[:-9]) model.load(weight_path)
return model return model
......
...@@ -99,18 +99,12 @@ YOLOv3 的网络结构由基础特征提取网络、multi-scale特征融合层 ...@@ -99,18 +99,12 @@ YOLOv3 的网络结构由基础特征提取网络、multi-scale特征融合层
| ... | ...
``` ```
```bash
sh pretrain_weights/download.sh
```
### 模型训练 ### 模型训练
数据准备完成后,可使用`main.py`脚本启动训练和评估,如下脚本会自动每epoch交替进行训练和模型评估,并将checkpoint默认保存在`yolo_checkpoint`目录下。 数据准备完成后,可使用`main.py`脚本启动训练和评估,如下脚本会自动每epoch交替进行训练和模型评估,并将checkpoint默认保存在`yolo_checkpoint`目录下。
YOLOv3模型训练总batch_size为64训练,以下以使用4卡Tesla P40每卡batch_size为16训练介绍训练方式。对于静态图和动态图,多卡训练中`--batch_size`为每卡上的batch_size,即总batch_size为`--batch_size`乘以卡数。 YOLOv3模型训练总batch_size为64训练,以下以使用4卡Tesla P40每卡batch_size为16训练介绍训练方式。对于静态图和动态图,多卡训练中`--batch_size`为每卡上的batch_size,即总batch_size为`--batch_size`乘以卡数。
YOLOv3模型训练须加载骨干网络[DarkNet53]()的预训练权重,可在训练时通过`--pretrain_weights`指定,若指定为URL,将自动下载权重至`~/.cache/paddle/weights`目录并加载。
`main.py`脚本参数可通过如下命令查询 `main.py`脚本参数可通过如下命令查询
```bash ```bash
...@@ -122,7 +116,7 @@ python main.py --help ...@@ -122,7 +116,7 @@ python main.py --help
使用如下方式进行多卡训练: 使用如下方式进行多卡训练:
```bash ```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data=<path/to/dataset> --batch_size=16 --pretrain_weights=https://paddlemodels.bj.bcebos.com/hapi/darknet53_pretrained.pdparams CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data=<path/to/dataset> --batch_size=16
``` ```
#### 动态图训练 #### 动态图训练
...@@ -132,7 +126,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data= ...@@ -132,7 +126,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data=
使用如下方式进行多卡训练: 使用如下方式进行多卡训练:
```bash ```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py -m paddle.distributed.launch --data=<path/to/dataset> --batch_size=16 -d --pretrain_weights=https://paddlemodels.bj.bcebos.com/hapi/darknet53_pretrained.pdparams CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py -m paddle.distributed.launch --data=<path/to/dataset> --batch_size=16 -d
``` ```
......
...@@ -25,8 +25,9 @@ from paddle.fluid.optimizer import Momentum ...@@ -25,8 +25,9 @@ from paddle.fluid.optimizer import Momentum
from paddle.io import DataLoader from paddle.io import DataLoader
from hapi.model import Model, Input, set_device from hapi.model import Model, Input, set_device
from hapi.vision.models import yolov3_darknet53, YoloLoss
from hapi.vision.transforms import * from modeling import yolov3_darknet53, YoloLoss
from transforms import *
from visualizer import draw_bbox from visualizer import draw_bbox
......
...@@ -27,12 +27,12 @@ from paddle.io import DataLoader ...@@ -27,12 +27,12 @@ from paddle.io import DataLoader
from hapi.model import Model, Input, set_device from hapi.model import Model, Input, set_device
from hapi.distributed import DistributedBatchSampler from hapi.distributed import DistributedBatchSampler
from hapi.download import is_url, get_weights_path from hapi.vision.transforms import Compose, BatchCompose
from hapi.datasets import COCODataset
from hapi.vision.transforms import *
from hapi.vision.models import yolov3_darknet53, YoloLoss
from modeling import yolov3_darknet53, YoloLoss
from coco import COCODataset
from coco_metric import COCOMetric from coco_metric import COCOMetric
from transforms import *
NUM_MAX_BOXES = 50 NUM_MAX_BOXES = 50
...@@ -126,10 +126,7 @@ def main(): ...@@ -126,10 +126,7 @@ def main():
pretrained=pretrained) pretrained=pretrained)
if FLAGS.pretrain_weights and not FLAGS.eval_only: if FLAGS.pretrain_weights and not FLAGS.eval_only:
pretrain_weights = FLAGS.pretrain_weights model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
if is_url(pretrain_weights):
pretrain_weights = get_weights_path(pretrain_weights)
model.load(pretrain_weights, skip_mismatch=True, reset_optimizer=True)
optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters()) optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters())
...@@ -168,7 +165,7 @@ def main(): ...@@ -168,7 +165,7 @@ def main():
save_dir="yolo_checkpoint/mixup", save_dir="yolo_checkpoint/mixup",
save_freq=10) save_freq=10)
# do not use image mixup transfrom in laste FLAGS.no_mixup_epoch epoches # do not use image mixup transfrom in the last FLAGS.no_mixup_epoch epoches
dataset.mixup = False dataset.mixup = False
model.fit(train_data=loader, model.fit(train_data=loader,
epochs=FLAGS.no_mixup_epoch, epochs=FLAGS.no_mixup_epoch,
...@@ -200,8 +197,7 @@ if __name__ == '__main__': ...@@ -200,8 +197,7 @@ if __name__ == '__main__':
parser.add_argument( parser.add_argument(
"-j", "--num_workers", default=4, type=int, help="reader worker number") "-j", "--num_workers", default=4, type=int, help="reader worker number")
parser.add_argument( parser.add_argument(
"-p", "--pretrain_weights", "-p", "--pretrain_weights", default=None, type=str,
default="./pretrain_weights/darknet53_pretrained", type=str,
help="path to pretrained weights") help="path to pretrained weights")
parser.add_argument( parser.add_argument(
"-r", "--resume", default=None, type=str, "-r", "--resume", default=None, type=str,
......
...@@ -16,13 +16,13 @@ from __future__ import division ...@@ -16,13 +16,13 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
from paddle.fluid.param_attr import ParamAttr from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay from paddle.fluid.regularizer import L2Decay
from hapi.model import Model, Loss from hapi.model import Model, Loss
from hapi.download import get_weights_path from hapi.download import get_weights_path
from .darknet import darknet53, ConvBNLayer from hapi.vision.models import darknet53
__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53'] __all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']
...@@ -33,6 +33,46 @@ pretrain_infos = { ...@@ -33,6 +33,46 @@ pretrain_infos = {
} }
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act="leaky"):
super(ConvBNLayer, self).__init__()
self.conv = Conv2D(
num_channels=ch_in,
num_filters=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=False,
act=None)
self.batch_norm = BatchNorm(
num_channels=ch_out,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02),
regularizer=L2Decay(0.)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.)))
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = fluid.layers.leaky_relu(x=out, alpha=0.1)
return out
class YoloDetectionBlock(fluid.dygraph.Layer): class YoloDetectionBlock(fluid.dygraph.Layer):
def __init__(self, ch_in, channel): def __init__(self, ch_in, channel):
super(YoloDetectionBlock, self).__init__() super(YoloDetectionBlock, self).__init__()
...@@ -118,7 +158,7 @@ class YOLOv3(Model): ...@@ -118,7 +158,7 @@ class YOLOv3(Model):
self.nms_posk = 100 self.nms_posk = 100
self.draw_thresh = 0.5 self.draw_thresh = 0.5
self.backbone = darknet53(pretrained=False) self.backbone = darknet53(pretrained=(model_mode=='train'))
self.block_outputs = [] self.block_outputs = []
self.yolo_blocks = [] self.yolo_blocks = []
self.route_blocks = [] self.route_blocks = []
...@@ -254,7 +294,7 @@ def _yolov3_darknet(num_layers=53, num_classes=80, ...@@ -254,7 +294,7 @@ def _yolov3_darknet(num_layers=53, num_classes=80,
weight_path = get_weights_path(*(pretrain_infos[num_layers])) weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \ assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams" "suffix of weight must be .pdparams"
model.load(weight_path[:-9]) model.load(weight_path)
return model return model
......
...@@ -218,8 +218,6 @@ class ProgBarLogger(Callback): ...@@ -218,8 +218,6 @@ class ProgBarLogger(Callback):
# if steps is not None, last step will update in on_epoch_end # if steps is not None, last step will update in on_epoch_end
if self.steps and self.train_step < self.steps: if self.steps and self.train_step < self.steps:
self._updates(logs, 'train') self._updates(logs, 'train')
else:
self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None): def on_epoch_end(self, epoch, logs=None):
logs = logs or {} logs = logs or {}
...@@ -238,7 +236,7 @@ class ProgBarLogger(Callback): ...@@ -238,7 +236,7 @@ class ProgBarLogger(Callback):
def on_eval_batch_end(self, step, logs=None): def on_eval_batch_end(self, step, logs=None):
logs = logs or {} logs = logs or {}
self.eval_step = step self.eval_step += 1
samples = logs.get('batch_size', 1) samples = logs.get('batch_size', 1)
self.evaled_samples += samples self.evaled_samples += samples
......
...@@ -12,7 +12,14 @@ ...@@ -12,7 +12,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from . import folder
from . import mnist
from . import flowers
from .folder import * from .folder import *
from .mnist import * from .mnist import *
from .flowers import * from .flowers import *
from .coco import *
__all__ = folder.__all__ \
+ mnist.__all__ \
+ flowers.__all__
...@@ -18,7 +18,7 @@ import cv2 ...@@ -18,7 +18,7 @@ import cv2
from paddle.io import Dataset from paddle.io import Dataset
__all__ = ["DatasetFolder"] __all__ = ["DatasetFolder", "ImageFolder"]
def has_valid_extension(filename, extensions): def has_valid_extension(filename, extensions):
...@@ -164,3 +164,80 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', ...@@ -164,3 +164,80 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
def cv2_loader(path): def cv2_loader(path):
return cv2.imread(path) return cv2.imread(path)
class ImageFolder(Dataset):
"""A generic data loader where the samples are arranged in this way:
root/1.ext
root/2.ext
root/sub_dir/3.ext
Args:
root (string): Root directory path.
loader (callable, optional): A function to load a sample given its path.
extensions (tuple[string], optional): A list of allowed extensions.
both extensions and is_valid_file should not be passed.
transform (callable, optional): A function/transform that takes in
a sample and returns a transformed version.
is_valid_file (callable, optional): A function that takes path of a file
and check if the file is a valid file (used to check of corrupt files)
both extensions and is_valid_file should not be passed.
Attributes:
samples (list): List of sample path
"""
def __init__(self,
root,
loader=None,
extensions=None,
transform=None,
is_valid_file=None):
self.root = root
if extensions is None:
extensions = IMG_EXTENSIONS
samples = []
path = os.path.expanduser(root)
if not ((extensions is None) ^ (is_valid_file is None)):
raise ValueError(
"Both extensions and is_valid_file cannot be None or not None at the same time"
)
if extensions is not None:
def is_valid_file(x):
return has_valid_extension(x, extensions)
for root, _, fnames in sorted(os.walk(path, followlinks=True)):
for fname in sorted(fnames):
f = os.path.join(root, fname)
if is_valid_file(f):
samples.append(f)
if len(samples) == 0:
raise (RuntimeError(
"Found 0 files in subfolders of: " + self.root + "\n"
"Supported extensions are: " + ",".join(extensions)))
self.loader = cv2_loader if loader is None else loader
self.extensions = extensions
self.samples = samples
self.transform = transform
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
tuple: (sample, target) where target is class_index of the target class.
"""
path = self.samples[index]
sample = self.loader(path)
if self.transform is not None:
sample = self.transform(sample)
return [sample]
def __len__(self):
return len(self.samples)
...@@ -816,7 +816,7 @@ class Model(fluid.dygraph.Layer): ...@@ -816,7 +816,7 @@ class Model(fluid.dygraph.Layer):
except ValueError as err: except ValueError as err:
if skip_mismatch: if skip_mismatch:
warnings.warn( warnings.warn(
("Skip loading for {}. ".format(key) + err.message)) ("Skip loading for {}. ".format(key) + str(err)))
# reset optimizer when mismatch happens # reset optimizer when mismatch happens
reset_optimizer = True reset_optimizer = True
else: else:
...@@ -1161,7 +1161,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1161,7 +1161,7 @@ class Model(fluid.dygraph.Layer):
if fluid.in_dygraph_mode(): if fluid.in_dygraph_mode():
feed_list = None feed_list = None
else: else:
feed_list = [x.forward() for x in self._inputs + self._labels] feed_list = [x.forward() for x in self._inputs]
if test_data is not None and isinstance(test_data, Dataset): if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler( test_sampler = DistributedBatchSampler(
...@@ -1281,7 +1281,7 @@ class Model(fluid.dygraph.Layer): ...@@ -1281,7 +1281,7 @@ class Model(fluid.dygraph.Layer):
if mode == 'train': if mode == 'train':
assert epoch is not None, 'when mode is train, epoch must be given' assert epoch is not None, 'when mode is train, epoch must be given'
callbacks.on_epoch_end(epoch) callbacks.on_epoch_end(epoch, logs)
return logs return logs
......
...@@ -30,6 +30,7 @@ from hapi.distributed import DistributedBatchSampler ...@@ -30,6 +30,7 @@ from hapi.distributed import DistributedBatchSampler
from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
from hapi.text.bert.batching import prepare_batch_data from hapi.text.bert.batching import prepare_batch_data
import hapi.text.tokenizer.tokenization as tokenization import hapi.text.tokenizer.tokenization as tokenization
from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
__all__ = [ __all__ = [
'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset', 'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
...@@ -227,6 +228,9 @@ class SingleSentenceDataset(Dataset): ...@@ -227,6 +228,9 @@ class SingleSentenceDataset(Dataset):
if line_processor is None: if line_processor is None:
line_processor = default_line_processor line_processor = default_line_processor
if ParallelEnv().nranks > 1:
leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)
if not os.path.exists(leveldb_file): if not os.path.exists(leveldb_file):
print("putting data %s into leveldb %s" % print("putting data %s into leveldb %s" %
(input_file, leveldb_file)) (input_file, leveldb_file))
...@@ -384,7 +388,12 @@ class BertDataLoader(object): ...@@ -384,7 +388,12 @@ class BertDataLoader(object):
quotechar=None, quotechar=None,
device=fluid.CPUPlace(), device=fluid.CPUPlace(),
num_workers=0, num_workers=0,
return_list=True): return_list=True,
phase="train"):
assert phase in [
"train", "predict", "test"
], "phase of BertDataLoader should be in [train, predict, test], but get %s" % phase
self.dataset = SingleSentenceDataset(tokenizer, label_list, self.dataset = SingleSentenceDataset(tokenizer, label_list,
max_seq_length, mode) max_seq_length, mode)
...@@ -394,15 +403,21 @@ class BertDataLoader(object): ...@@ -394,15 +403,21 @@ class BertDataLoader(object):
input_file, label_list, max_seq_length, tokenizer, input_file, label_list, max_seq_length, tokenizer,
line_processor, delimiter, quotechar) line_processor, delimiter, quotechar)
elif mode == "leveldb": elif mode == "leveldb":
#prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
self.dataset.prepare_leveldb(input_file, leveldb_file, label_list, self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
max_seq_length, tokenizer, max_seq_length, tokenizer,
line_processor, delimiter, quotechar) line_processor, delimiter, quotechar)
else: else:
raise ValueError("mode should be in [all_in_memory, leveldb]") raise ValueError("mode should be in [all_in_memory, leveldb]")
self.sampler = DistributedBatchSampler( if phase == "train":
self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last) self.sampler = DistributedBatchSampler(
self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
elif phase == "test" or phase == "predict":
self.sampler = BatchSampler(
dataset=self.dataset,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
self.dataloader = DataLoader( self.dataloader = DataLoader(
dataset=self.dataset, dataset=self.dataset,
......
...@@ -130,6 +130,18 @@ class Optimizer(object): ...@@ -130,6 +130,18 @@ class Optimizer(object):
return True return True
return False return False
def state_dict(self):
return self.optimizer.state_dict()
def set_dict(self, state_dict):
return self.optimizer.set_dict(state_dict)
def get_opti_var_name_list(self):
return self.optimizer.get_opti_var_name_list()
def current_step_lr(self):
return self.optimizer.current_step_lr()
def minimize(self, loss, use_data_parallel=False, model=None): def minimize(self, loss, use_data_parallel=False, model=None):
param_list = dict() param_list = dict()
......
...@@ -22,7 +22,7 @@ import sys ...@@ -22,7 +22,7 @@ import sys
if six.PY2: if six.PY2:
reload(sys) reload(sys)
sys.setdefaultencoding('utf8') sys.setdefaultencoding('utf8')
import ast import ast
import time import time
import argparse as argparse import argparse as argparse
...@@ -44,13 +44,12 @@ from paddle.fluid import layers ...@@ -44,13 +44,12 @@ from paddle.fluid import layers
from paddle.fluid.dygraph import Layer from paddle.fluid.dygraph import Layer
from paddle.fluid.layers import BeamSearchDecoder from paddle.fluid.layers import BeamSearchDecoder
__all__ = [ __all__ = [
'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
'TransformerDecoder', 'TransformerBeamSearchDecoder', 'GRUCell', 'GRUEncoderCell', 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf',
'BiGRU', 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging' 'Crf_decoding', 'SequenceTagging'
] ]
...@@ -219,7 +218,19 @@ class BasicLSTMCell(RNNCell): ...@@ -219,7 +218,19 @@ class BasicLSTMCell(RNNCell):
gate_activation=None, gate_activation=None,
activation=None, activation=None,
forget_bias=1.0, forget_bias=1.0,
dtype='float32'): dtype='float32',
forget_gate_weights={"w": None,
"h": None,
"b": None},
input_gate_weights={"w": None,
"h": None,
"b": None},
output_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicLSTMCell, self).__init__() super(BasicLSTMCell, self).__init__()
self._hidden_size = hidden_size self._hidden_size = hidden_size
...@@ -233,25 +244,225 @@ class BasicLSTMCell(RNNCell): ...@@ -233,25 +244,225 @@ class BasicLSTMCell(RNNCell):
self._dtype = dtype self._dtype = dtype
self._input_size = input_size self._input_size = input_size
self._weight = self.create_parameter( self.use_customized_weight = False
attr=self._param_attr, for _weights in [
shape=[ forget_gate_weights, input_gate_weights, output_gate_weights,
self._input_size + self._hidden_size, 4 * self._hidden_size cell_weights
], ]:
dtype=self._dtype) for _key in _weights:
if _weights[_key] is not None:
self._bias = self.create_parameter( self.use_customized_weight = True
attr=self._bias_attr, break
shape=[4 * self._hidden_size], if self.use_customized_weight:
dtype=self._dtype, break
is_bias=True)
if not self.use_customized_weight:
self._weight = self.create_parameter(
attr=self._param_attr,
shape=[
self._input_size + self._hidden_size, 4 * self._hidden_size
],
dtype=self._dtype)
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
else:
if "w" in forget_gate_weights and forget_gate_weights[
"w"] is not None:
self.fg_w = forget_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_w"
else:
tmp_param_attr = self._param_attr
self.fg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in forget_gate_weights and forget_gate_weights[
"h"] is not None:
self.fg_h = forget_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_h"
else:
tmp_param_attr = self._param_attr
self.fg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in forget_gate_weights and forget_gate_weights[
"b"] is not None:
self.fg_b = forget_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_forget_gate_b"
else:
tmp_param_attr = self._bias_attr
self.fg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in input_gate_weights and input_gate_weights[
"w"] is not None:
self.ig_w = input_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_w"
else:
tmp_param_attr = self._param_attr
self.ig_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in input_gate_weights and input_gate_weights[
"h"] is not None:
self.ig_h = input_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_h"
else:
tmp_param_attr = self._param_attr
self.ig_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in input_gate_weights and input_gate_weights[
"b"] is not None:
self.ig_b = input_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_input_gate_b"
else:
tmp_param_attr = self._bias_attr
self.ig_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in output_gate_weights and output_gate_weights[
"w"] is not None:
self.og_w = output_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_w"
else:
tmp_param_attr = self._param_attr
self.og_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in output_gate_weights and output_gate_weights[
"h"] is not None:
self.og_h = output_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_h"
else:
tmp_param_attr = self._param_attr
self.og_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in output_gate_weights and output_gate_weights[
"b"] is not None:
self.og_b = output_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_output_gate_b"
else:
tmp_param_attr = self._bias_attr
self.og_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = self._param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = self._param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = self._bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state): def forward(self, input, state):
if self.use_customized_weight:
weight_w = fluid.layers.concat(
[self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
weight_h = fluid.layers.concat(
[self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
_weight = fluid.layers.concat([weight_w, weight_h], axis=0)
_bias = fluid.layers.concat(
[self.ig_b, self.c_b, self.fg_b, self.og_b])
else:
_weight = self._weight
_bias = self._bias
pre_hidden, pre_cell = state pre_hidden, pre_cell = state
concat_input_hidden = layers.concat([input, pre_hidden], 1) concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) gate_input = layers.matmul(x=concat_input_hidden, y=_weight)
gate_input = layers.elementwise_add(gate_input, self._bias) gate_input = layers.elementwise_add(gate_input, _bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add( new_cell = layers.elementwise_add(
layers.elementwise_mul( layers.elementwise_mul(
...@@ -308,7 +519,16 @@ class BasicGRUCell(RNNCell): ...@@ -308,7 +519,16 @@ class BasicGRUCell(RNNCell):
bias_attr=None, bias_attr=None,
gate_activation=None, gate_activation=None,
activation=None, activation=None,
dtype='float32'): dtype='float32',
update_gate_weights={"w": None,
"h": None,
"b": None},
reset_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicGRUCell, self).__init__() super(BasicGRUCell, self).__init__()
self._input_size = input_size self._input_size = input_size
self._hidden_size = hidden_size self._hidden_size = hidden_size
...@@ -318,6 +538,20 @@ class BasicGRUCell(RNNCell): ...@@ -318,6 +538,20 @@ class BasicGRUCell(RNNCell):
self._activation = activation or layers.tanh self._activation = activation or layers.tanh
self._dtype = dtype self._dtype = dtype
assert isinstance(update_gate_weights, dict)
assert isinstance(reset_gate_weights, dict)
assert isinstance(cell_weights, dict)
self.use_customized_weight = False
for _weights in [
update_gate_weights, reset_gate_weights, cell_weights
]:
for _key in _weights:
if _weights[_key] is not None:
self.use_customized_weight = True
if self.use_customized_weight:
break
if self._param_attr is not None and self._param_attr.name is not None: if self._param_attr is not None and self._param_attr.name is not None:
gate_param_attr = copy.deepcopy(self._param_attr) gate_param_attr = copy.deepcopy(self._param_attr)
candidate_param_attr = copy.deepcopy(self._param_attr) candidate_param_attr = copy.deepcopy(self._param_attr)
...@@ -327,43 +561,194 @@ class BasicGRUCell(RNNCell): ...@@ -327,43 +561,194 @@ class BasicGRUCell(RNNCell):
gate_param_attr = self._param_attr gate_param_attr = self._param_attr
candidate_param_attr = self._param_attr candidate_param_attr = self._param_attr
self._gate_weight = self.create_parameter( if not self.use_customized_weight:
attr=gate_param_attr, self._gate_weight = self.create_parameter(
shape=[self._input_size + self._hidden_size, 2 * self._hidden_size], attr=gate_param_attr,
dtype=self._dtype) shape=[
self._input_size + self._hidden_size, 2 * self._hidden_size
self._candidate_weight = self.create_parameter( ],
attr=candidate_param_attr, dtype=self._dtype)
shape=[self._input_size + self._hidden_size, self._hidden_size],
dtype=self._dtype) self._candidate_weight = self.create_parameter(
attr=candidate_param_attr,
shape=[
self._input_size + self._hidden_size, self._hidden_size
],
dtype=self._dtype)
if self._bias_attr is not None and self._bias_attr.name is not None:
gate_bias_attr = copy.deepcopy(self._bias_attr)
candidate_bias_attr = copy.deepcopy(self._bias_attr)
gate_bias_attr.name += "_gate"
candidate_bias_attr.name += "_candidate"
else:
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr
self._gate_bias = self.create_parameter(
attr=gate_bias_attr,
shape=[2 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
self._candidate_bias = self.create_parameter(
attr=candidate_bias_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if self._bias_attr is not None and self._bias_attr.name is not None:
gate_bias_attr = copy.deepcopy(self._bias_attr)
candidate_bias_attr = copy.deepcopy(self._bias_attr)
gate_bias_attr.name += "_gate"
candidate_bias_attr.name += "_candidate"
else: else:
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr # create the parameters of gates in gru
if "w" in update_gate_weights and update_gate_weights[
self._gate_bias = self.create_parameter( "w"] is not None:
attr=gate_bias_attr, self.ug_w = update_gate_weights["w"]
shape=[2 * self._hidden_size], else:
dtype=self._dtype, if gate_param_attr is not None and gate_param_attr.name is not None:
is_bias=True) tmp_param_attr = copy.deepcopy(gate_param_attr)
self._candidate_bias = self.create_parameter( tmp_param_attr.name += "_update_gate_w"
attr=candidate_bias_attr, else:
shape=[self._hidden_size], tmp_param_attr = gate_param_attr
dtype=self._dtype, self.ug_w = self.create_parameter(
is_bias=True) attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in update_gate_weights and update_gate_weights[
"h"] is not None:
self.ug_h = update_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_h"
else:
tmp_param_attr = gate_param_attr
self.ug_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in update_gate_weights and update_gate_weights[
"b"] is not None:
self.ug_b = update_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_update_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.ug_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# reset gate parameters
if "w" in reset_gate_weights and reset_gate_weights[
"w"] is not None:
self.rg_w = reset_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_w"
else:
tmp_param_attr = gate_param_attr
self.rg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in reset_gate_weights and reset_gate_weights[
"h"] is not None:
self.rg_h = reset_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_h"
else:
tmp_param_attr = gate_param_attr
self.rg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in reset_gate_weights and reset_gate_weights[
"b"] is not None:
self.rg_b = reused_params["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_reset_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.rg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = gate_param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = gate_param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = gate_bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state): def forward(self, input, state):
if self.use_customized_weight:
rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
_gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
_candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
_gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
_candidate_bias = self.c_b
else:
_gate_weight = self._gate_weight
_gate_bias = self._gate_bias
_candidate_weight = self._candidate_weight
_candidate_bias = self._candidate_bias
pre_hidden = state pre_hidden = state
concat_input_hidden = layers.concat([input, pre_hidden], axis=1) concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight)
gate_input = layers.elementwise_add(gate_input, self._gate_bias) gate_input = layers.elementwise_add(gate_input, _gate_bias)
gate_input = self._gate_activation(gate_input) gate_input = self._gate_activation(gate_input)
r, u = layers.split(gate_input, num_or_sections=2, dim=1) r, u = layers.split(gate_input, num_or_sections=2, dim=1)
...@@ -371,8 +756,8 @@ class BasicGRUCell(RNNCell): ...@@ -371,8 +756,8 @@ class BasicGRUCell(RNNCell):
r_hidden = r * pre_hidden r_hidden = r * pre_hidden
candidate = layers.matmul( candidate = layers.matmul(
layers.concat([input, r_hidden], 1), self._candidate_weight) layers.concat([input, r_hidden], 1), _candidate_weight)
candidate = layers.elementwise_add(candidate, self._candidate_bias) candidate = layers.elementwise_add(candidate, _candidate_bias)
c = self._activation(candidate) c = self._activation(candidate)
new_hidden = u * pre_hidden + (1 - u) * c new_hidden = u * pre_hidden + (1 - u) * c
...@@ -700,7 +1085,11 @@ class PrePostProcessLayer(Layer): ...@@ -700,7 +1085,11 @@ class PrePostProcessLayer(Layer):
PrePostProcessLayer PrePostProcessLayer
""" """
def __init__(self, process_cmd, d_model, dropout_rate): def __init__(self,
process_cmd,
d_model,
dropout_rate,
reused_layer_norm=None):
super(PrePostProcessLayer, self).__init__() super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd self.process_cmd = process_cmd
self.functors = [] self.functors = []
...@@ -708,16 +1097,21 @@ class PrePostProcessLayer(Layer): ...@@ -708,16 +1097,21 @@ class PrePostProcessLayer(Layer):
if cmd == "a": # add residual connection if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y else x) self.functors.append(lambda x, y: x + y if y else x)
elif cmd == "n": # add layer normalization elif cmd == "n": # add layer normalization
if reused_layer_norm is not None:
layer_norm = reused_layer_norm
else:
layer_norm = LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
self.functors.append( self.functors.append(
self.add_sublayer( self.add_sublayer(
"layer_norm_%d" % len( "layer_norm_%d" % len(
self.sublayers(include_sublayers=False)), self.sublayers(include_sublayers=False)),
LayerNorm( layer_norm))
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))))
elif cmd == "d": # add dropout elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout( self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False) x, dropout_prob=dropout_rate, is_test=False)
...@@ -737,21 +1131,48 @@ class MultiHeadAttention(Layer): ...@@ -737,21 +1131,48 @@ class MultiHeadAttention(Layer):
Multi-Head Attention Multi-Head Attention
""" """
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): def __init__(self,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
reused_query_fc=None,
reused_key_fc=None,
reused_value_fc=None,
reused_proj_fc=None):
super(MultiHeadAttention, self).__init__() super(MultiHeadAttention, self).__init__()
self.n_head = n_head self.n_head = n_head
self.d_key = d_key self.d_key = d_key
self.d_value = d_value self.d_value = d_value
self.d_model = d_model self.d_model = d_model
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) if reused_query_fc is not None:
self.k_fc = Linear( self.q_fc = reused_query_fc
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) else:
self.v_fc = Linear( self.q_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.proj_fc = Linear( if reused_key_fc is not None:
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) self.k_fc = reused_key_fc
else:
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_value_fc is not None:
self.v_fc = reused_value_fc
else:
self.v_fc = Linear(
input_dim=d_model,
output_dim=d_value * n_head,
bias_attr=False)
if reused_proj_fc is not None:
self.proj_fc = reused_proj_fc
else:
self.proj_fc = Linear(
input_dim=d_value * n_head,
output_dim=d_model,
bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None): def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention if keys is None: # self-attention
...@@ -828,12 +1249,24 @@ class FFN(Layer): ...@@ -828,12 +1249,24 @@ class FFN(Layer):
Feed-Forward Network Feed-Forward Network
""" """
def __init__(self, d_inner_hid, d_model, dropout_rate): def __init__(self,
d_inner_hid,
d_model,
dropout_rate,
fc1_act="relu",
reused_fc1=None,
reused_fc2=None):
super(FFN, self).__init__() super(FFN, self).__init__()
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.fc1 = Linear( if reused_fc1 is not None:
input_dim=d_model, output_dim=d_inner_hid, act="relu") self.fc1 = reused_fc1
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) else:
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
if reused_fc2 is not None:
self.fc2 = reused_fc2
else:
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x): def forward(self, x):
hidden = self.fc1(x) hidden = self.fc1(x)
...@@ -859,22 +1292,52 @@ class TransformerEncoderLayer(Layer): ...@@ -859,22 +1292,52 @@ class TransformerEncoderLayer(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da"): postprocess_cmd="da",
ffn_fc1_act="relu",
reused_pre_selatt_layernorm=None,
reused_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerEncoderLayer, self).__init__() super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, reused_pre_selatt_layernorm)
attention_dropout) self.self_attn = MultiHeadAttention(
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, d_key,
prepostprocess_dropout) d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
self.ffn = FFN(d_inner_hid, d_model, relu_dropout) reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
fc1_act=ffn_fc1_act,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
reused_post_ffn_layernorm)
def forward(self, enc_input, attn_bias): def forward(self, enc_input, attn_bias):
attn_output = self.self_attn( attn_output = self.self_attn(
...@@ -902,7 +1365,8 @@ class TransformerEncoder(Layer): ...@@ -902,7 +1365,8 @@ class TransformerEncoder(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da"): postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerEncoder, self).__init__() super(TransformerEncoder, self).__init__()
...@@ -912,9 +1376,17 @@ class TransformerEncoder(Layer): ...@@ -912,9 +1376,17 @@ class TransformerEncoder(Layer):
self.add_sublayer( self.add_sublayer(
"layer_%d" % i, "layer_%d" % i,
TransformerEncoderLayer( TransformerEncoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid, n_head,
prepostprocess_dropout, attention_dropout, d_key,
relu_dropout, preprocess_cmd, postprocess_cmd))) d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
ffn_fc1_act=ffn_fc1_act)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model, self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout)
...@@ -941,28 +1413,79 @@ class TransformerDecoderLayer(Layer): ...@@ -941,28 +1413,79 @@ class TransformerDecoderLayer(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da"): postprocess_cmd="da",
reused_pre_selfatt_layernorm=None,
reused_self_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_crossatt_layernorm=None,
reused_cross_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_crossatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerDecoderLayer, self).__init__() super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, reused_pre_selfatt_layernorm)
attention_dropout) self.self_attn = MultiHeadAttention(
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, d_key,
prepostprocess_dropout) d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_self_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_self_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, reused_pre_crossatt_layernorm)
attention_dropout) self.cross_attn = MultiHeadAttention(
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, d_key,
prepostprocess_dropout) d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_cross_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_cross_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_cross_multihead_att_weights[
"reused_proj_fc"])
self.postprocesser2 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_crossatt_layernorm)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
self.ffn = FFN(d_inner_hid, d_model, relu_dropout) reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout,
reused_post_ffn_layernorm)
def forward(self, def forward(self,
dec_input, dec_input,
...@@ -1031,7 +1554,7 @@ class TransformerDecoder(Layer): ...@@ -1031,7 +1554,7 @@ class TransformerDecoder(Layer):
] ]
#TODO: we should merge GRUCell with BasicGRUCell
class GRUCell(RNNCell): class GRUCell(RNNCell):
def __init__(self, def __init__(self,
input_size, input_size,
...@@ -1044,9 +1567,7 @@ class GRUCell(RNNCell): ...@@ -1044,9 +1567,7 @@ class GRUCell(RNNCell):
super(GRUCell, self).__init__() super(GRUCell, self).__init__()
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.fc_layer = Linear( self.fc_layer = Linear(
input_size, input_size, hidden_size * 3, param_attr=param_attr)
hidden_size * 3,
param_attr=param_attr)
self.gru_unit = GRUUnit( self.gru_unit = GRUUnit(
hidden_size * 3, hidden_size * 3,
...@@ -1067,7 +1588,8 @@ class GRUCell(RNNCell): ...@@ -1067,7 +1588,8 @@ class GRUCell(RNNCell):
return [self.hidden_size] return [self.hidden_size]
class GRUEncoderCell(RNNCell): #TODO: we should merge GRUCell with BasicGRUCell
class GRUEncoderCell(RNNCell):
def __init__(self, def __init__(self,
num_layers, num_layers,
input_size, input_size,
...@@ -1086,8 +1608,9 @@ class GRUEncoderCell(RNNCell): ...@@ -1086,8 +1608,9 @@ class GRUEncoderCell(RNNCell):
GRUCell( GRUCell(
input_size=input_size if i == 0 else hidden_size, input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size, hidden_size=hidden_size,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.UniformInitializer( param_attr=fluid.ParamAttr(
low=-init_scale, high=init_scale))))) initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states): def forward(self, step_input, states):
new_states = [] new_states = []
...@@ -1109,18 +1632,17 @@ class GRUEncoderCell(RNNCell): ...@@ -1109,18 +1632,17 @@ class GRUEncoderCell(RNNCell):
class BiGRU(fluid.dygraph.Layer): class BiGRU(fluid.dygraph.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__() super(BiGRU, self).__init__()
self.gru = RNN(GRUEncoderCell(1, input_dim, self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
grnn_hidden_dim, 0.0, init_bound), init_bound),
is_reverse=False, is_reverse=False,
time_major=False) time_major=False)
self.gru_r = RNN(GRUEncoderCell(1, input_dim, self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
grnn_hidden_dim, 0.0, init_bound), init_bound),
is_reverse=True, is_reverse=True,
time_major=False) time_major=False)
def forward(self, input_feature): def forward(self, input_feature):
pre_gru, pre_state = self.gru(input_feature) pre_gru, pre_state = self.gru(input_feature)
gru_r, r_state = self.gru_r(input_feature) gru_r, r_state = self.gru_r(input_feature)
bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1) bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
...@@ -1320,14 +1842,14 @@ class SequenceTagging(fluid.dygraph.Layer): ...@@ -1320,14 +1842,14 @@ class SequenceTagging(fluid.dygraph.Layer):
emission = self.fc(bigru_output) emission = self.fc(bigru_output)
if target is not None: if target is not None:
crf_cost = self.linear_chain_crf( crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths) input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost) avg_cost = fluid.layers.mean(x=crf_cost)
self.crf_decoding.weight = self.linear_chain_crf.weight self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths) crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths return crf_decode, avg_cost, lengths
else: else:
self.linear_chain_crf.weight = self.crf_decoding.weight self.linear_chain_crf.weight = self.crf_decoding.weight
crf_decode = self.crf_decoding(input=emission, length=lengths) crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, lengths return crf_decode, lengths
...@@ -17,24 +17,15 @@ from . import vgg ...@@ -17,24 +17,15 @@ from . import vgg
from . import mobilenetv1 from . import mobilenetv1
from . import mobilenetv2 from . import mobilenetv2
from . import darknet from . import darknet
from . import yolov3
from . import tsm
from . import bmn_model
from .resnet import * from .resnet import *
from .mobilenetv1 import * from .mobilenetv1 import *
from .mobilenetv2 import * from .mobilenetv2 import *
from .vgg import * from .vgg import *
from .darknet import * from .darknet import *
from .yolov3 import *
from .tsm import *
from .bmn_model import *
__all__ = resnet.__all__ \ __all__ = resnet.__all__ \
+ vgg.__all__ \ + vgg.__all__ \
+ mobilenetv1.__all__ \ + mobilenetv1.__all__ \
+ mobilenetv2.__all__ \ + mobilenetv2.__all__ \
+ darknet.__all__ \ + darknet.__all__
+ yolov3.__all__ \
+ tsm.__all__ \
+ bmn_model.__all__
...@@ -22,7 +22,7 @@ from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Pool2D, Linear ...@@ -22,7 +22,7 @@ from paddle.fluid.dygraph.nn import Conv2D, BatchNorm, Pool2D, Linear
from hapi.model import Model from hapi.model import Model
from hapi.download import get_weights_path from hapi.download import get_weights_path
__all__ = ['DarkNet', 'ConvBNLayer', 'darknet53'] __all__ = ['DarkNet', 'darknet53']
# {num_layers: (url, md5)} # {num_layers: (url, md5)}
pretrain_infos = { pretrain_infos = {
......
...@@ -12,6 +12,11 @@ ...@@ -12,6 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from . import transforms
from . import functional
from .transforms import * from .transforms import *
from .functional import * from .functional import *
from .detection_transforms import *
__all__ = transforms.__all__ \
+ functional.__all__
...@@ -26,6 +26,8 @@ else: ...@@ -26,6 +26,8 @@ else:
Sequence = collections.abc.Sequence Sequence = collections.abc.Sequence
Iterable = collections.abc.Iterable Iterable = collections.abc.Iterable
__all__ = ['flip', 'resize']
def flip(image, code): def flip(image, code):
""" """
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册