diff --git a/demo/text_matching/predict.py b/demo/text_matching/predict.py
index 6fe6a42a6cc314b45c4d0aaa7c5ab07c6649e270..f4d171558c5e9a665d44c3a8709437928966e2eb 100644
--- a/demo/text_matching/predict.py
+++ b/demo/text_matching/predict.py
@@ -25,7 +25,7 @@ if __name__ == '__main__':
 
     model = hub.Module(
         name='ernie_tiny',
-        version='2.0.1',
+        version='2.0.2',
         task='text-matching',
         load_checkpoint='./checkpoint/best_model/model.pdparams',
         label_map=label_map)
diff --git a/demo/text_matching/train.py b/demo/text_matching/train.py
index 7770b3c0b266dbadb74d1ae7bf5e4e84446d9bf1..ddbebb32b5fc9a7fab09e90c1a78025560eb78bf 100644
--- a/demo/text_matching/train.py
+++ b/demo/text_matching/train.py
@@ -31,7 +31,7 @@ parser.add_argument("--save_interval", type=int, default=2, help="Save checkpoin
 args = parser.parse_args()
 
 if __name__ == '__main__':
-    model = hub.Module(name='ernie_tiny', version='2.0.1', task='text-matching')
+    model = hub.Module(name='ernie_tiny', version='2.0.2', task='text-matching')
     tokenizer = model.get_tokenizer()
 
     train_dataset = LCQMC(tokenizer=tokenizer, max_seq_len=args.max_seq_len, mode='train')
diff --git a/modules/text/language_model/bert-base-cased/README.md b/modules/text/language_model/bert-base-cased/README.md
index de01d9689393528bf9a21015a7f8a22933de098a..3f8c657d003055393dc9446c288f59d112014745 100644
--- a/modules/text/language_model/bert-base-cased/README.md
+++ b/modules/text/language_model/bert-base-cased/README.md
@@ -164,3 +164,7 @@ paddlehub >= 2.0.0
 * 2.0.1
 
   任务名称调整,增加序列标注任务`token-cls`
+
+* 2.0.2
+
+  增加文本匹配任务`text-matching`
\ No newline at end of file
diff --git a/modules/text/language_model/bert-base-cased/module.py b/modules/text/language_model/bert-base-cased/module.py
index 8b7b75d592938818a67d12a68d750018b66c9215..af74ab5f5c9a1ed69e7b98825caba6d4af0d4709 100644
--- a/modules/text/language_model/bert-base-cased/module.py
+++ b/modules/text/language_model/bert-base-cased/module.py
@@ -29,7 +29,7 @@ from paddlehub.utils.log import logger
 
 @moduleinfo(
     name="bert-base-cased",
-    version="2.0.1",
+    version="2.0.2",
     summary=
     "bert_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.",
     author="paddlepaddle",
@@ -72,6 +72,12 @@ class Bert(nn.Layer):
             self.metric = ChunkEvaluator(
                 label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
             )
+        elif task == 'text-matching':
+            self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased', **kwargs)
+            self.dropout = paddle.nn.Dropout(0.1)
+            self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2)
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
         elif task is None:
             self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-cased', **kwargs)
         else:
@@ -85,8 +91,28 @@ class Bert(nn.Layer):
             self.set_state_dict(state_dict)
             logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
 
-    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
-        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                attention_mask=None,
+                query_input_ids=None,
+                query_token_type_ids=None,
+                query_position_ids=None,
+                query_attention_mask=None,
+                title_input_ids=None,
+                title_token_type_ids=None,
+                title_position_ids=None,
+                title_attention_mask=None,
+                seq_lengths=None,
+                labels=None):
+
+        if self.task != 'text-matching':
+            result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        else:
+            query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask)
+            title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask)
+
         if self.task == 'seq-cls':
             logits = result
             probs = F.softmax(logits, axis=1)
@@ -109,6 +135,35 @@ class Bert(nn.Layer):
                 _, _, f1_score = map(float, self.metric.accumulate())
                 return token_level_probs, loss, {'f1_score': f1_score}
             return token_level_probs
+        elif self.task == 'text-matching':
+            query_token_embedding, _ = query_result
+            query_token_embedding = self.dropout(query_token_embedding)
+            query_attention_mask = paddle.unsqueeze(
+                (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2)
+            query_token_embedding = query_token_embedding * query_attention_mask
+            query_sum_embedding = paddle.sum(query_token_embedding, axis=1)
+            query_sum_mask = paddle.sum(query_attention_mask, axis=1)
+            query_mean = query_sum_embedding / query_sum_mask
+
+            title_token_embedding, _ = title_result
+            title_token_embedding = self.dropout(title_token_embedding)
+            title_attention_mask = paddle.unsqueeze(
+                (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2)
+            title_token_embedding = title_token_embedding * title_attention_mask
+            title_sum_embedding = paddle.sum(title_token_embedding, axis=1)
+            title_sum_mask = paddle.sum(title_attention_mask, axis=1)
+            title_mean = title_sum_embedding / title_sum_mask
+
+            sub = paddle.abs(paddle.subtract(query_mean, title_mean))
+            projection = paddle.concat([query_mean, title_mean, sub], axis=-1)
+            logits = self.classifier(projection)
+            probs = F.softmax(logits)
+            if labels is not None:
+                loss = self.criterion(logits, labels)
+                correct = self.metric.compute(probs, labels)
+                acc = self.metric.update(correct)
+                return probs, loss, {'acc': acc}
+            return probs
         else:
             sequence_output, pooled_output = result
             return sequence_output, pooled_output
diff --git a/modules/text/language_model/bert-base-chinese/README.md b/modules/text/language_model/bert-base-chinese/README.md
index 41a8e0c125993ca53e49bd063bdfaebf314b15ca..52fb33a61f0734b4a9c8abd611a1a765078a8ae8 100644
--- a/modules/text/language_model/bert-base-chinese/README.md
+++ b/modules/text/language_model/bert-base-chinese/README.md
@@ -163,3 +163,7 @@ paddlehub >= 2.0.0
 * 2.0.1
 
   任务名称调整,增加序列标注任务`token-cls`
+
+* 2.0.2
+
+  增加文本匹配任务`text-matching`
\ No newline at end of file
diff --git a/modules/text/language_model/bert-base-chinese/module.py b/modules/text/language_model/bert-base-chinese/module.py
index bb8cca195c38281bb71252d0a703e1a9a385d6ca..443e56b175d863eae2dd7f1b50c45d863db554c7 100644
--- a/modules/text/language_model/bert-base-chinese/module.py
+++ b/modules/text/language_model/bert-base-chinese/module.py
@@ -29,7 +29,7 @@ from paddlehub.utils.log import logger
 
 @moduleinfo(
     name="bert-base-chinese",
-    version="2.0.1",
+    version="2.0.2",
     summary=
     "bert_chinese_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.",
     author="paddlepaddle",
@@ -80,6 +80,12 @@ class Bert(nn.Layer):
             self.metric = ChunkEvaluator(
                 label_list=[self.label_map[i] for i in sorted(self.label_map.keys())]
             )
+        elif task == 'text-matching':
+            self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese', **kwargs)
+            self.dropout = paddle.nn.Dropout(0.1)
+            self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2)
+            self.criterion = paddle.nn.loss.CrossEntropyLoss()
+            self.metric = paddle.metric.Accuracy()
         elif task is None:
             self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-chinese', **kwargs)
         else:
@@ -93,8 +99,28 @@ class Bert(nn.Layer):
             self.set_state_dict(state_dict)
             logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
 
-    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None):
-        result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+    def forward(self,
+                input_ids=None,
+                token_type_ids=None,
+                position_ids=None,
+                attention_mask=None,
+                query_input_ids=None,
+                query_token_type_ids=None,
+                query_position_ids=None,
+                query_attention_mask=None,
+                title_input_ids=None,
+                title_token_type_ids=None,
+                title_position_ids=None,
+                title_attention_mask=None,
+                seq_lengths=None,
+                labels=None):
+
+        if self.task != 'text-matching':
+            result = self.model(input_ids, token_type_ids, position_ids, attention_mask)
+        else:
+            query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask)
+            title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask)
+
         if self.task == 'seq-cls':
             logits = result
             probs = F.softmax(logits, axis=1)
@@ -117,6 +143,35 @@ class Bert(nn.Layer):
                 _, _, f1_score = map(float, self.metric.accumulate())
                 return token_level_probs, loss, {'f1_score': f1_score}
             return token_level_probs
+        elif self.task == 'text-matching':
+            query_token_embedding, _ = query_result
+            query_token_embedding = self.dropout(query_token_embedding)
+            query_attention_mask = paddle.unsqueeze(
+                (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2)
+            query_token_embedding = query_token_embedding * query_attention_mask
+            query_sum_embedding = paddle.sum(query_token_embedding, axis=1)
+            query_sum_mask = paddle.sum(query_attention_mask, axis=1)
+            query_mean = query_sum_embedding /
query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/bert-base-multilingual-cased/README.md b/modules/text/language_model/bert-base-multilingual-cased/README.md index 14a2def03909a4c12bb190cb7ae4a438e6f0eb51..bd3355a8012eb5b6dd9a8938869c85d9bc3eca7d 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/README.md +++ b/modules/text/language_model/bert-base-multilingual-cased/README.md @@ -163,3 +163,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/bert-base-multilingual-cased/module.py b/modules/text/language_model/bert-base-multilingual-cased/module.py index 124a0ce4ff563d3e72b6cb58b6fbe003ede51f73..e1c79e94dd17e64e17dda61d04c12234dcc84c6c 100644 --- a/modules/text/language_model/bert-base-multilingual-cased/module.py +++ b/modules/text/language_model/bert-base-multilingual-cased/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="bert-base-multilingual-cased", - version="2.0.1", + version="2.0.2", summary= "bert_multi_cased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class Bert(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-cased', **kwargs) else: @@ -93,8 +99,28 @@ class Bert(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Bert(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/bert-base-multilingual-uncased/README.md b/modules/text/language_model/bert-base-multilingual-uncased/README.md index 3d07c2130a7f3381c63f8f40edadd62e8f0d661d..8ff208805bb1658073ab39aa0b5b97e52acb31b1 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/README.md +++ b/modules/text/language_model/bert-base-multilingual-uncased/README.md @@ -163,3 +163,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/bert-base-multilingual-uncased/module.py b/modules/text/language_model/bert-base-multilingual-uncased/module.py index c957d7e3ef22655e63f0d3298cd39b2c19fe4076..82fc6a4b09b30ad872c8bd89053a3a1ebb158ec8 100644 --- a/modules/text/language_model/bert-base-multilingual-uncased/module.py +++ b/modules/text/language_model/bert-base-multilingual-uncased/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="bert-base-multilingual-uncased", - version="2.0.1", + version="2.0.2", summary= "bert_multi_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class Bert(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-multilingual-uncased', **kwargs) else: @@ -93,8 +99,28 @@ class Bert(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Bert(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + 
query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/bert-base-uncased/README.md b/modules/text/language_model/bert-base-uncased/README.md index 84867e57c002527ee33a8d821489faa445499244..0d42f276cb69c76d834650532c1b51cb44ab1c11 100644 --- a/modules/text/language_model/bert-base-uncased/README.md +++ b/modules/text/language_model/bert-base-uncased/README.md @@ -163,3 +163,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/bert-base-uncased/module.py b/modules/text/language_model/bert-base-uncased/module.py index 8c06ad3424a29828ddc08d7be7e8edec962b5003..876d1a52ecf1277c60d69c9c0643013304c1d393 100644 --- a/modules/text/language_model/bert-base-uncased/module.py +++ b/modules/text/language_model/bert-base-uncased/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="bert-base-uncased", - version="2.0.1", + version="2.0.2", summary= "bert_uncased_L-12_H-768_A-12, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -72,6 +72,12 @@ class Bert(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased', **kwargs) else: @@ -85,8 +91,28 @@ class Bert(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -109,6 +135,35 @@ class Bert(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/bert-large-cased/README.md b/modules/text/language_model/bert-large-cased/README.md index 1cd6285fca959172b524336ab2adbf9715982c83..376e9fbc574247552e7f94e1c07446f962008c50 100644 --- a/modules/text/language_model/bert-large-cased/README.md +++ b/modules/text/language_model/bert-large-cased/README.md @@ -163,3 +163,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/bert-large-cased/module.py b/modules/text/language_model/bert-large-cased/module.py index d456b78faec8ba53e4e50ae59ed8cd7f9afd089c..b60eae2692f726edcc82c60dd523f3020de53ebf 100644 --- a/modules/text/language_model/bert-large-cased/module.py +++ b/modules/text/language_model/bert-large-cased/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="bert-large-cased", - version="2.0.1", + version="2.0.2", summary= "bert_cased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -72,6 +72,12 @@ class Bert(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-cased', **kwargs) else: @@ -85,8 +91,28 @@ class Bert(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -109,6 +135,35 @@ class Bert(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / 
query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/bert-large-uncased/README.md b/modules/text/language_model/bert-large-uncased/README.md index 1a40ce6a99c668e8531cc6f73c830bd3bfbdd29a..072bfcba01aad281094007e52333e200a9289e70 100644 --- a/modules/text/language_model/bert-large-uncased/README.md +++ b/modules/text/language_model/bert-large-uncased/README.md @@ -163,3 +163,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/bert-large-uncased/module.py b/modules/text/language_model/bert-large-uncased/module.py index cedcba1d2a192539551c6f16b7a37d2c60aa9a05..026d8188602f4ac63c0bba02ac8dce33cee98dc6 100644 --- a/modules/text/language_model/bert-large-uncased/module.py +++ b/modules/text/language_model/bert-large-uncased/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="bert-large-uncased", - version="2.0.1", + version="2.0.2", summary= "bert_uncased_L-24_H-1024_A-16, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -72,6 +72,12 @@ class Bert(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-large-uncased', **kwargs) else: @@ -85,8 +91,28 @@ class Bert(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -109,6 +135,35 @@ class Bert(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/chinese_bert_wwm/README.md b/modules/text/language_model/chinese_bert_wwm/README.md index 61eabad78d61b63ea552a9fd565606369c4477ec..734a203e917322e13e435ad445dd7e2f4e2fb8fe 100644 --- a/modules/text/language_model/chinese_bert_wwm/README.md +++ b/modules/text/language_model/chinese_bert_wwm/README.md @@ -156,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/chinese_bert_wwm/module.py b/modules/text/language_model/chinese_bert_wwm/module.py index b225bb4eb49938b47b0cb8fa895efab0aa962e82..259f8b42e05b5fae395ff5fab526f16e081dd609 100644 --- a/modules/text/language_model/chinese_bert_wwm/module.py +++ b/modules/text/language_model/chinese_bert_wwm/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm", - version="2.0.0", + version="2.0.1", summary= "chinese-bert-wwm, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", @@ -81,6 +81,12 @@ class BertWwm(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-chinese', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-chinese', **kwargs) else: @@ -94,8 +100,28 @@ class BertWwm(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -118,6 +144,35 @@ class BertWwm(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / 
query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/chinese_bert_wwm_ext/README.md b/modules/text/language_model/chinese_bert_wwm_ext/README.md index 7a287a3067816bf9ddeed8db9d009edfe7dcb374..5aac1aee0b7109eb515648c4672faa705f3aad55 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/README.md +++ b/modules/text/language_model/chinese_bert_wwm_ext/README.md @@ -156,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/chinese_bert_wwm_ext/module.py b/modules/text/language_model/chinese_bert_wwm_ext/module.py index 2a4e825621b5b1972484108a0d8cbbb63aa468b3..8e6e8c6f616d1d28eb3e2270fc4a37691697ab17 100644 --- a/modules/text/language_model/chinese_bert_wwm_ext/module.py +++ b/modules/text/language_model/chinese_bert_wwm_ext/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="chinese-bert-wwm-ext", - version="2.0.0", + version="2.0.1", summary= "chinese-bert-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. 
The module is executed as paddle.dygraph.", author="ymcui", @@ -81,6 +81,12 @@ class BertWwm(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-ext-chinese', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-wwm-ext-chinese', **kwargs) else: @@ -94,8 +100,28 @@ class BertWwm(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -118,6 +144,35 @@ class BertWwm(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/chinese_electra_base/README.md b/modules/text/language_model/chinese_electra_base/README.md index b3dfff0c998907231f97ae556dd2e5b5c135cd32..ea96ff3961ecc9de542b3df63bf0fe846e9bcec5 100644 --- a/modules/text/language_model/chinese_electra_base/README.md +++ b/modules/text/language_model/chinese_electra_base/README.md @@ -155,3 +155,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/chinese_electra_base/module.py b/modules/text/language_model/chinese_electra_base/module.py index 338c6605c04473ca385e2818d3a3724dc15e7ef9..84e320391824590cd04427621b86268172241a86 100644 --- a/modules/text/language_model/chinese_electra_base/module.py +++ b/modules/text/language_model/chinese_electra_base/module.py @@ -28,7 +28,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="chinese-electra-base", - version="2.0.0", + version="2.0.1", summary= "chinese-electra-base, 12-layer, 768-hidden, 12-heads, 102M parameters. The module is executed as paddle.dygraph.", author="ymcui", @@ -80,6 +80,12 @@ class Electra(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='chinese-electra-base', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='chinese-electra-base', **kwargs) else: @@ -93,8 +99,28 @@ class Electra(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Electra(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(query_token_embedding.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + 
query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(title_token_embedding.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/chinese_electra_small/README.md b/modules/text/language_model/chinese_electra_small/README.md index 4a5f0e99e6e6f07b3c1051e8ba9c36c7f7e3d9ad..620ae041fa4ceea6a75e3c76d89d7649c72f65fa 100644 --- a/modules/text/language_model/chinese_electra_small/README.md +++ b/modules/text/language_model/chinese_electra_small/README.md @@ -155,3 +155,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/chinese_electra_small/module.py b/modules/text/language_model/chinese_electra_small/module.py index 763f7d4f070f6ffb047d589a40af552132f49a20..03401f0101443b2c4bf5820e81aa6e42cd4a2d2c 100644 --- a/modules/text/language_model/chinese_electra_small/module.py +++ b/modules/text/language_model/chinese_electra_small/module.py @@ -28,7 +28,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="chinese-electra-small", - version="2.0.0", + version="2.0.1", summary= "chinese-electra-small, 12-layer, 256-hidden, 4-heads, 12M parameters. 
The module is executed as paddle.dygraph.", author="ymcui", @@ -80,6 +80,12 @@ class Electra(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='chinese-electra-small', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='chinese-electra-small', **kwargs) else: @@ -93,8 +99,28 @@ class Electra(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Electra(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(query_token_embedding.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(title_token_embedding.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/electra_base/README.md b/modules/text/language_model/electra_base/README.md index df076cc085979a1e0549c28092df2a29d039990a..cee97e1bf695772729bd7081d563d5d295334dd6 100644 --- a/modules/text/language_model/electra_base/README.md +++ b/modules/text/language_model/electra_base/README.md @@ -151,3 +151,7 @@ paddlehub >= 2.0.0 * 1.0.0 初始发布,动态图版本模型,支持文本分类`seq-cls`和序列标注`token-cls`任务的fine-tune + +* 1.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/electra_base/module.py b/modules/text/language_model/electra_base/module.py index 1cfd62ffb9a4a6a93872b90ecc08eec616a5db93..6ef01fbb265205e9bb4b05d68ad181d2ef44c99f 100644 --- a/modules/text/language_model/electra_base/module.py +++ b/modules/text/language_model/electra_base/module.py @@ -28,7 +28,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="electra-base", - version="1.0.0", + version="1.0.1", summary= "electra-base, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class Electra(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-base', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-base', **kwargs) else: @@ -93,8 +99,28 @@ class Electra(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Electra(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(query_token_embedding.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + 
title_token_embedding = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(title_token_embedding.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/electra_large/README.md b/modules/text/language_model/electra_large/README.md index 81f931d8442d29cd4c09008f4075b522e78531e5..4619d1327df9ee776716b0dd364a6377b836c1cd 100644 --- a/modules/text/language_model/electra_large/README.md +++ b/modules/text/language_model/electra_large/README.md @@ -151,3 +151,7 @@ paddlehub >= 2.0.0 * 1.0.0 初始发布,动态图版本模型,支持文本分类`seq-cls`和序列标注`token-cls`任务的fine-tune + +* 1.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/electra_large/module.py b/modules/text/language_model/electra_large/module.py index ae11788dcbe6ab0032e75aca0fc3f65e8ac59577..31efe69cec04e3ba0e6e6d55a5155da306340edf 100644 --- a/modules/text/language_model/electra_large/module.py +++ b/modules/text/language_model/electra_large/module.py @@ -28,7 +28,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="electra-large", - version="1.0.0", + version="1.0.1", summary= "electra-large, 24-layer, 1024-hidden, 16-heads, 335M parameters. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class Electra(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-large', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-large', **kwargs) else: @@ -93,8 +99,28 @@ class Electra(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Electra(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(query_token_embedding.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(title_token_embedding.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/electra_small/README.md 
b/modules/text/language_model/electra_small/README.md index 65ec7548447b1b28cb960a644eaafb2a7a1c6e5b..6d90d9d9187c693a476cb57841de736a4b9b8f24 100644 --- a/modules/text/language_model/electra_small/README.md +++ b/modules/text/language_model/electra_small/README.md @@ -151,3 +151,7 @@ paddlehub >= 2.0.0 * 1.0.0 初始发布,动态图版本模型,支持文本分类`seq-cls`和序列标注`token-cls`任务的fine-tune + +* 1.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/electra_small/module.py b/modules/text/language_model/electra_small/module.py index ad60dd88035704a6a674ec291d0585a6d4a43548..697411f3fb44925ce6b466dc4ca335329532c847 100644 --- a/modules/text/language_model/electra_small/module.py +++ b/modules/text/language_model/electra_small/module.py @@ -28,7 +28,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="electra-small", - version="1.0.0", + version="1.0.1", summary= "electra-small, 12-layer, 256-hidden, 4-heads, 14M parameters. The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class Electra(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-small', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ElectraModel.from_pretrained(pretrained_model_name_or_path='electra-small', **kwargs) else: @@ -93,8 +99,28 @@ class Electra(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Electra(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(query_token_embedding.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding = title_result + title_token_embedding = 
self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(title_token_embedding.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/ernie/README.md b/modules/text/language_model/ernie/README.md index 1a42cc99e0f0b07f449105f98a5b5ce41c3a4596..cd2ad5ffb1e8ff54085b099852c7681e33166fa5 100644 --- a/modules/text/language_model/ernie/README.md +++ b/modules/text/language_model/ernie/README.md @@ -184,3 +184,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/ernie/module.py b/modules/text/language_model/ernie/module.py index 37a995009c42d8f98a6089925ed7ec7e7e4a4a6c..ecb2e40e14a01ab0228662b76a36004253f29b99 100644 --- a/modules/text/language_model/ernie/module.py +++ b/modules/text/language_model/ernie/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="ernie", - version="2.0.1", + version="2.0.2", summary= "Baidu's ERNIE, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -72,6 +72,12 @@ class Ernie(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-1.0', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-1.0', **kwargs) else: @@ -85,8 +91,28 @@ class Ernie(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -109,6 +135,35 @@ class Ernie(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/ernie_tiny/README.md 
b/modules/text/language_model/ernie_tiny/README.md index 04ace50ced480b26039fd7df1a39cff1e69452f5..03fc0acdb14179bdc800a3a1a3551b1ae0f05705 100644 --- a/modules/text/language_model/ernie_tiny/README.md +++ b/modules/text/language_model/ernie_tiny/README.md @@ -176,3 +176,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_base/README.md b/modules/text/language_model/ernie_v2_eng_base/README.md index 3f747302915c7565d2578e8a591967d891361e9d..818471c2fc94cf1cc7c7cbd03fe58ca5eccd0108 100644 --- a/modules/text/language_model/ernie_v2_eng_base/README.md +++ b/modules/text/language_model/ernie_v2_eng_base/README.md @@ -172,3 +172,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_base/module.py b/modules/text/language_model/ernie_v2_eng_base/module.py index 59ea31b7a91ef9f3336a1c15d75d950c215de52f..fd5a6033d8a3b41b58e021991682386c1ad0822f 100644 --- a/modules/text/language_model/ernie_v2_eng_base/module.py +++ b/modules/text/language_model/ernie_v2_eng_base/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="ernie_v2_eng_base", - version="2.0.1", + version="2.0.2", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class ErnieV2(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-en', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-en', **kwargs) else: @@ -93,8 +99,28 @@ class ErnieV2(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class ErnieV2(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + 
query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/ernie_v2_eng_large/README.md b/modules/text/language_model/ernie_v2_eng_large/README.md index 08c100f15244c743f4bb5dd0caf883512a548b20..d11edff18449e31fc7a72200465c9db6c6f5968a 100644 --- a/modules/text/language_model/ernie_v2_eng_large/README.md +++ b/modules/text/language_model/ernie_v2_eng_large/README.md @@ -171,3 +171,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/ernie_v2_eng_large/module.py b/modules/text/language_model/ernie_v2_eng_large/module.py index 0d54a6706986e536db288811d9fadf41888573eb..5be8965d89945586e814cc1940148312e0d34eb6 100644 --- a/modules/text/language_model/ernie_v2_eng_large/module.py +++ b/modules/text/language_model/ernie_v2_eng_large/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="ernie_v2_eng_large", - version="2.0.1", + version="2.0.2", summary= "Baidu's ERNIE 2.0, Enhanced Representation through kNowledge IntEgration, max_seq_len=512 when predtrained. 
The module is executed as paddle.dygraph.", author="paddlepaddle", @@ -80,6 +80,12 @@ class ErnieV2(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-large-en', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-2.0-large-en', **kwargs) else: @@ -93,8 +99,28 @@ class ErnieV2(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class ErnieV2(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/rbt3/README.md b/modules/text/language_model/rbt3/README.md index 89d6928942614869ec5b40b129c1c731930e5f36..ccc7674425ca46498a0869b913b7ab34b73f882a 100644 --- a/modules/text/language_model/rbt3/README.md +++ b/modules/text/language_model/rbt3/README.md @@ -156,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/rbt3/module.py b/modules/text/language_model/rbt3/module.py index 63d2b5db848e330a1b0482e0fc49fd3f100b4cf7..8c8fe78902c5e5135cdb92726c3163eb9c890dd0 100644 --- a/modules/text/language_model/rbt3/module.py +++ b/modules/text/language_model/rbt3/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="rbt3", - version="2.0.0", + version="2.0.1", summary="rbt3, 3-layer, 768-hidden, 12-heads, 38M parameters ", author="ymcui", author_email="ymcui@ir.hit.edu.cn", @@ -80,6 +80,12 @@ class Roberta(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbt3', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbt3', **kwargs) else: @@ -93,8 +99,28 @@ class Roberta(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Roberta(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = 
paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/rbtl3/README.md b/modules/text/language_model/rbtl3/README.md index 80b1c67e8d6eeb228a8347f102591350cb829bab..de1e475acdf4643382f441706b8a7c4dc3fa17b5 100644 --- a/modules/text/language_model/rbtl3/README.md +++ b/modules/text/language_model/rbtl3/README.md @@ -156,3 +156,7 @@ paddlehub >= 2.0.0 * 2.0.0 全面升级动态图,接口有所变化。任务名称调整,增加序列标注任务`token-cls` + +* 2.0.1 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/rbtl3/module.py b/modules/text/language_model/rbtl3/module.py index ac00a9a5d4a161ca88c785e4444a60e9799d1e82..e2d357954dd7a6ddc347023ba7a3f403eedf7256 100644 --- a/modules/text/language_model/rbtl3/module.py +++ b/modules/text/language_model/rbtl3/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="rbtl3", - version="2.0.0", + version="2.0.1", summary="rbtl3, 3-layer, 1024-hidden, 16-heads, 61M parameters ", author="ymcui", author_email="ymcui@ir.hit.edu.cn", @@ -80,6 +80,12 @@ class Roberta(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbtl3', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='rbtl3', **kwargs) else: @@ -93,8 +99,28 @@ class Roberta(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = 
F.softmax(logits, axis=1) @@ -117,6 +143,35 @@ class Roberta(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git a/modules/text/language_model/roberta-wwm-ext-large/README.md b/modules/text/language_model/roberta-wwm-ext-large/README.md index 0b5f46cafb5e8c7f670fd236bad15d5b5d797141..74629300eabc3b10af2b2c59be2ca13929c2afcb 100644 --- a/modules/text/language_model/roberta-wwm-ext-large/README.md +++ b/modules/text/language_model/roberta-wwm-ext-large/README.md @@ -158,3 +158,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/roberta-wwm-ext-large/module.py b/modules/text/language_model/roberta-wwm-ext-large/module.py index aa45811d3adc201288cc5c836d2027b584a2d065..74a4c4714cec329ae5061e244ea82c69b5a52afb 100644 --- a/modules/text/language_model/roberta-wwm-ext-large/module.py +++ b/modules/text/language_model/roberta-wwm-ext-large/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="roberta-wwm-ext-large", - version="2.0.1", + version="2.0.2", summary= "chinese-roberta-wwm-ext-large, 24-layer, 1024-hidden, 16-heads, 340M parameters. 
The module is executed as paddle.dygraph.", author="ymcui", @@ -81,6 +81,12 @@ class Roberta(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext-large', **kwargs) else: @@ -94,8 +100,28 @@ class Roberta(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -118,6 +144,35 @@ class Roberta(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output diff --git 
a/modules/text/language_model/roberta-wwm-ext/README.md b/modules/text/language_model/roberta-wwm-ext/README.md index 2eedd7d3e5deffdf5a9d2eacf7c6919136269c84..19db19d8de1b6fa5f8760a98b7165e0fe5a3e2af 100644 --- a/modules/text/language_model/roberta-wwm-ext/README.md +++ b/modules/text/language_model/roberta-wwm-ext/README.md @@ -158,3 +158,7 @@ paddlehub >= 2.0.0 * 2.0.1 任务名称调整,增加序列标注任务`token-cls` + +* 2.0.2 + + 增加文本匹配任务`text-matching` \ No newline at end of file diff --git a/modules/text/language_model/roberta-wwm-ext/module.py b/modules/text/language_model/roberta-wwm-ext/module.py index 8fa2bbe77ec3871c1e4168349a05a436af89f69f..844ff23ddfc664b546cfe25867065441ed7ea0d8 100644 --- a/modules/text/language_model/roberta-wwm-ext/module.py +++ b/modules/text/language_model/roberta-wwm-ext/module.py @@ -29,7 +29,7 @@ from paddlehub.utils.log import logger @moduleinfo( name="roberta-wwm-ext", - version="2.0.1", + version="2.0.2", summary= "chinese-roberta-wwm-ext, 12-layer, 768-hidden, 12-heads, 110M parameters. The module is executed as paddle.dygraph.", author="ymcui", @@ -81,6 +81,12 @@ class Roberta(nn.Layer): self.metric = ChunkEvaluator( label_list=[self.label_map[i] for i in sorted(self.label_map.keys())] ) + elif task == 'text-matching': + self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext', **kwargs) + self.dropout = paddle.nn.Dropout(0.1) + self.classifier = paddle.nn.Linear(self.model.config['hidden_size']*3, 2) + self.criterion = paddle.nn.loss.CrossEntropyLoss() + self.metric = paddle.metric.Accuracy() elif task is None: self.model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-wwm-ext', **kwargs) else: @@ -94,8 +100,28 @@ class Roberta(nn.Layer): self.set_state_dict(state_dict) logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint)) - def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, seq_lengths=None, labels=None): - result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + query_input_ids=None, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_input_ids=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + seq_lengths=None, + labels=None): + + if self.task != 'text-matching': + result = self.model(input_ids, token_type_ids, position_ids, attention_mask) + else: + query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) + title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) + if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) @@ -118,6 +144,35 @@ class Roberta(nn.Layer): _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs + elif self.task == 'text-matching': + query_token_embedding, _ = query_result + query_token_embedding = self.dropout(query_token_embedding) + query_attention_mask = paddle.unsqueeze( + (query_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + query_token_embedding = query_token_embedding * query_attention_mask + query_sum_embedding = paddle.sum(query_token_embedding, axis=1) + query_sum_mask = paddle.sum(query_attention_mask, axis=1) + query_mean = query_sum_embedding / query_sum_mask + + 
title_token_embedding, _ = title_result + title_token_embedding = self.dropout(title_token_embedding) + title_attention_mask = paddle.unsqueeze( + (title_input_ids != self.model.pad_token_id).astype(self.model.pooler.dense.weight.dtype), axis=2) + title_token_embedding = title_token_embedding * title_attention_mask + title_sum_embedding = paddle.sum(title_token_embedding, axis=1) + title_sum_mask = paddle.sum(title_attention_mask, axis=1) + title_mean = title_sum_embedding / title_sum_mask + + sub = paddle.abs(paddle.subtract(query_mean, title_mean)) + projection = paddle.concat([query_mean, title_mean, sub], axis=-1) + logits = self.classifier(projection) + probs = F.softmax(logits) + if labels is not None: + loss = self.criterion(logits, labels) + correct = self.metric.compute(probs, labels) + acc = self.metric.update(correct) + return probs, loss, {'acc': acc} + return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output
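
Every module touched by this patch gains the same text-matching branch: dropout over the backbone's token embeddings, masked mean-pooling for the query and for the title, and a classifier over the concatenation of query_mean, title_mean and |query_mean - title_mean|. The sketch below condenses that branch into a standalone snippet so the pooling logic is easier to follow. It is a minimal illustration, not the modules' own code: the pretrained backbone is replaced by random tensors, and batch_size, seq_len, hidden_size and pad_token_id are made-up stand-ins.

```python
# Minimal sketch of the shared text-matching head added in this patch:
# masked mean-pooling of token embeddings for query and title, then a
# classifier over [u, v, |u - v|]. The transformer backbone is stubbed
# with random embeddings; in the real modules it is the pretrained model.
import paddle
import paddle.nn.functional as F

batch_size, seq_len, hidden_size, pad_token_id = 2, 8, 768, 0

# Stand-ins for tokenized query/title batches and their token embeddings.
query_input_ids = paddle.randint(1, 100, shape=[batch_size, seq_len])
title_input_ids = paddle.randint(1, 100, shape=[batch_size, seq_len])
query_token_embedding = paddle.randn([batch_size, seq_len, hidden_size])
title_token_embedding = paddle.randn([batch_size, seq_len, hidden_size])

dropout = paddle.nn.Dropout(0.1)
classifier = paddle.nn.Linear(hidden_size * 3, 2)  # matched / not matched


def masked_mean(token_embedding, input_ids):
    # Zero out padding positions, then average over the remaining tokens.
    mask = paddle.unsqueeze(
        (input_ids != pad_token_id).astype(token_embedding.dtype), axis=2)
    token_embedding = dropout(token_embedding) * mask
    return paddle.sum(token_embedding, axis=1) / paddle.sum(mask, axis=1)


query_mean = masked_mean(query_token_embedding, query_input_ids)
title_mean = masked_mean(title_token_embedding, title_input_ids)

# Concatenate u, v and |u - v| (an SBERT-style interaction feature) and classify.
sub = paddle.abs(paddle.subtract(query_mean, title_mean))
projection = paddle.concat([query_mean, title_mean, sub], axis=-1)
probs = F.softmax(classifier(projection), axis=-1)
print(probs.shape)  # [2, 2]
```

Fine-tuning any of the updated modules on a paired dataset then mirrors the existing ernie_tiny demo; for instance, `hub.Module(name='electra-small', version='1.0.1', task='text-matching')` should expose this same head, with the name and version matching the bumps in this diff. That call is an assumption based on the demo scripts, not something the patch itself adds.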