“87f9191e4202e050da174981aa0d9b1fa3cd0f51”上不存在“paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp”
提交 7c600b66 编写于 作者: K kinghuin 提交者: wuzewu

Merdev (#143)

* Add Reading Comprehension Task

* Add Reading Comprehension Task

* Add Reading Comprehension Task

* Add Reading Comprehension Task

* Add the reading comprehension task

* add reading comprehension task

* Add reading comprehension

* Add reading comprehension task

* add reading comprehension task

* Add reading comprehension task

* Add reading comprehension task

* Fix tokenization for bert chn

* Add GLUE and XNLI, modify text-classification demo test=develop (#94)

* Add GLUE and XNLI, modify text-classification demo

* add hub_server rw lock

* Support GLUE (#108)

* Support GLUE

* Support MNLI_m and MNLI_mm

* restore checkpoint.py

* Modify for review

* Fix the bug whether module is valid or not

* Enhancetask (#122)

* accelerate predict

* Add autoft (#127)

* add autofinetune

* update paddlehub required libs

* Add copyright

* Split task and add regression demo  (#130)

* split task

* add regression demo

* implement CombinedStrategy (#128)

* implement CombinedStrategy

* optimize some details (#131)

* optimize some details

* optimize some details (#133)

* support cv_reader

* use logger instead of print

* add warning

* remove todo

* add gradual_unfreeze frz_blocks

* replace the visualization toolkit from visualdl to tb-paddle (#139)

* replace the logging toolkit from visualdl to tb-paddle.

* modified:   requirements.txt

* modified:   paddlehub/finetune/task/basic_task.py

* update autofinetune  (#135)

* update autofinetune (add modelbased evaluator)

* support cpu count

* fix ci
上级 e5cb716b
......@@ -10,13 +10,13 @@ import numpy as np
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning.")
parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.")
parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.")
# yapf: enable.
module_map = {
......
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
def normalize_answer(s):
    """Canonicalise an answer string before comparison.

    The text is lower-cased, ASCII punctuation is removed, the stand-alone
    words "a", "an" and "the" are blanked out, and finally every run of
    whitespace is collapsed to a single space (leading/trailing whitespace
    is dropped).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punc = ''.join(c for c in lowered if c not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
    return ' '.join(without_articles.split())
def f1_score(prediction, ground_truth):
    """Return the token-level F1 between a prediction and one reference.

    Both strings are normalised with `normalize_answer` and split on
    whitespace; the overlap is the multiset intersection of the two token
    lists. Returns 0 when no token is shared.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)
def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after `normalize_answer`."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score `prediction` against every reference and keep the best score.

    `metric_fn` is called as `metric_fn(prediction, ground_truth)` for each
    element of `ground_truths`. An empty `ground_truths` raises ValueError
    (from `max`).
    """
    return max(
        metric_fn(prediction, reference) for reference in ground_truths)
def evaluate(dataset, predictions):
    """Compute SQuAD v1.1 exact-match and F1 percentages.

    Args:
        dataset: iterable of articles; each article has a 'paragraphs' list,
            each paragraph a 'qas' list, and each qa an 'id' plus an
            'answers' list of dicts with a 'text' entry.
        predictions: mapping from question id to the predicted answer text.

    Returns:
        A dict with keys 'exact_match' and 'f1', each a percentage averaged
        over every question in `dataset`. Questions without a prediction are
        reported on stderr and count as 0. A dataset that contains no
        questions yields 0.0 for both metrics instead of dividing by zero.
    """
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                        ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction,
                                                    ground_truths)
    if total == 0:
        # Nothing to average over; avoid ZeroDivisionError.
        return {'exact_match': 0.0, 'f1': 0.0}
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {'exact_match': exact_match, 'f1': f1}
if __name__ == '__main__':
    # Command-line entry point: evaluate a prediction file against a
    # SQuAD v1.1 dataset file and print the metrics as JSON on stdout.
    #
    # Local import so the block is self-contained; io.open gives an explicit
    # UTF-8 decode on both Python 2 and 3 instead of the platform default.
    import io

    expected_version = '1.1'
    parser = argparse.ArgumentParser(
        description='Evaluation for SQuAD ' + expected_version)
    parser.add_argument('dataset_file', help='Dataset file')
    parser.add_argument('prediction_file', help='Prediction File')
    args = parser.parse_args()
    with io.open(args.dataset_file, 'r', encoding='utf8') as dataset_file:
        dataset_json = json.load(dataset_file)
        # A version mismatch is only warned about; evaluation still runs.
        if (dataset_json['version'] != expected_version):
            print(
                'Evaluation expects v-' + expected_version +
                ', but got dataset with v-' + dataset_json['version'],
                file=sys.stderr)
        dataset = dataset_json['data']
    print(args.prediction_file)
    with io.open(
            args.prediction_file, 'r', encoding='utf8') as prediction_file:
        predictions = json.load(prediction_file)
    print(json.dumps(evaluate(dataset, predictions)))
"""Official evaluation script for SQuAD version 2.0.
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys
def make_qid_to_has_ans(dataset):
    """Map every question id in `dataset` to whether it has gold answers.

    The value is `bool(qa['answers'])`, i.e. False for an empty answer list.
    """
    return {
        qa['id']: bool(qa['answers'])
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }
def normalize_answer(s):
    """Canonicalise an answer string before comparison.

    Steps, in order: lower-case, strip ASCII punctuation, replace the
    stand-alone words "a", "an" and "the" with a space, collapse whitespace.
    """
    article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    punctuation = set(string.punctuation)

    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    text = article_pattern.sub(' ', text)
    return ' '.join(text.split())
def get_tokens(s):
    """Return the whitespace tokens of the normalised answer `s`.

    A falsy `s` (empty string or None) yields an empty list without calling
    `normalize_answer`.
    """
    return normalize_answer(s).split() if s else []
def compute_exact(a_gold, a_pred):
    """Return 1 if gold and predicted answers match after normalisation, else 0."""
    is_match = normalize_answer(a_gold) == normalize_answer(a_pred)
    return int(is_match)
def compute_f1(a_gold, a_pred):
    """Return the token-level F1 between a gold and a predicted answer.

    Tokens come from `get_tokens`. When either side has no tokens (a
    no-answer case) the score is 1 if both are empty and 0 otherwise.
    Returns 0 when the two token lists share nothing.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    shared = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(shared.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
def get_raw_scores(dataset, preds):
    """Compute per-question exact-match and F1 scores.

    Args:
        dataset: iterable of articles with 'paragraphs' -> 'qas' -> 'id' /
            'answers' (each answer a dict with 'text').
        preds: mapping from question id to predicted answer text.

    Returns:
        `(exact_scores, f1_scores)`, two dicts keyed by question id. Each
        value is the maximum over the question's gold answers. Questions
        missing from `preds` are reported on stdout and left out of both
        dicts.
    """
    exact_scores, f1_scores = {}, {}
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                qid = qa['id']
                # Gold answers that normalise to nothing are dropped; for
                # unanswerable questions the only correct answer is the
                # empty string.
                gold_answers = [
                    answer['text'] for answer in qa['answers']
                    if normalize_answer(answer['text'])
                ] or ['']
                if qid not in preds:
                    print('Missing prediction for %s' % qid)
                    continue
                predicted = preds[qid]
                # Take max over all gold answers
                exact_scores[qid] = max(
                    compute_exact(gold, predicted) for gold in gold_answers)
                f1_scores[qid] = max(
                    compute_f1(gold, predicted) for gold in gold_answers)
    return exact_scores, f1_scores
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Re-score questions whose no-answer value exceeds the threshold.

    For each question id in `scores`: if `na_probs[qid] > na_prob_thresh`
    the model is treated as predicting "no answer", and the score becomes
    1.0 when the question truly has no answer and 0.0 otherwise. All other
    questions keep their original score. Returns a new dict.
    """
    return {
        qid: (float(not qid_to_has_ans[qid])
              if na_probs[qid] > na_prob_thresh else raw_score)
        for qid, raw_score in scores.items()
    }
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-question scores into percentage metrics.

    Args:
        exact_scores: mapping question id -> exact-match score.
        f1_scores: mapping question id -> F1 score.
        qid_list: optional ids to restrict the aggregate to; when falsy,
            every entry of the score dicts is used.

    Returns:
        An OrderedDict with keys 'exact', 'f1' (both percentages) and
        'total' (the number of questions aggregated).
    """
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[qid] for qid in qid_list)
        f1_sum = sum(f1_scores[qid] for qid in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())
    return collections.OrderedDict([
        ('exact', 100.0 * exact_sum / total),
        ('f1', 100.0 * f1_sum / total),
        ('total', total),
    ])
def merge_eval(main_eval, new_eval, prefix):
    """Copy every entry of `new_eval` into `main_eval` under '<prefix>_<key>'.

    `main_eval` is modified in place; nothing is returned.
    """
    main_eval.update(
        ('%s_%s' % (prefix, key), value) for key, value in new_eval.items())
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs,
                         qid_to_has_ans):
    """Store the best achievable exact/F1 scores and their thresholds.

    Runs `find_best_thresh` once on the raw exact-match scores and once on
    the raw F1 scores, then writes 'best_exact', 'best_exact_thresh',
    'best_f1' and 'best_f1_thresh' into `main_eval` in place.
    """
    for metric_name, raw_scores in (('exact', exact_raw), ('f1', f1_raw)):
        best_score, best_thresh = find_best_thresh(preds, raw_scores,
                                                   na_probs, qid_to_has_ans)
        main_eval['best_' + metric_name] = best_score
        main_eval['best_' + metric_name + '_thresh'] = best_thresh
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """Sweep the no-answer threshold and return the best score found.

    The sweep starts from "answer nothing" (one point per truly unanswerable
    question) and then walks the question ids in increasing `na_probs`
    order, accounting for each question being answered in turn.

    Returns:
        `(best_score_percent, best_thresh)` where the percentage is taken
        over `len(scores)` and `best_thresh` is the `na_probs` value at
        which the running score peaked (0.0 if it never improved).
    """
    running = sum(1 for has_ans in qid_to_has_ans.values() if not has_ans)
    best_score = running
    best_thresh = 0.0
    for qid in sorted(na_probs, key=na_probs.get):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            # Answerable: answering earns whatever the raw score is.
            delta = scores[qid]
        elif preds[qid]:
            # Unanswerable but a non-empty answer was predicted: lose a point.
            delta = -1
        else:
            delta = 0
        running += delta
        if running > best_score:
            best_score = running
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Predict with a finetuned reading comprehension model and evaluate on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import collections
import json
import io
import math
import numpy as np
import os
import six
import sys
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
import evaluate_v1
import evaluate_v2
# Set PaddleHub's logger to the "INFO" level.
hub.common.logger.logger.setLevel("INFO")

# Command-line options. They are parsed at import time into the module-level
# `args`. Boolean options go through ast.literal_eval, so they must be given
# as the Python literals True / False.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--learning_rate", type=float, default=4e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint.")
# NOTE(review): --result_dir has no default but is joined into output paths
# by the script's main section, so it effectively has to be supplied.
parser.add_argument("--result_dir", type=str, default=None, help="Directory to predicted results to be written.")
parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.")
parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.")
parser.add_argument("--n_best_size", type=int, default=20, help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
args = parser.parse_args()
# yapf: enable.
def write_predictions(
        all_examples,
        all_features,
        all_results,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
):
    """Write final predictions to the json file and log-odds of null if needed.

    Args:
        all_examples: sequence of examples; `doc_tokens` and `qas_id` are
            read from each one.
        all_features: iterable of features; `example_index`, `unique_id`,
            `tokens`, `token_to_orig_map` and `token_is_max_context` are read.
        all_results: iterable of results exposing `unique_id`,
            `start_logits` and `end_logits`.
        output_prediction_file: path of the JSON file mapping question id to
            the chosen answer text.
        output_nbest_file: path of the JSON file mapping question id to its
            n-best list (text, probability, start/end logit).
        output_null_log_odds_file: path of the JSON file mapping question id
            to the null-vs-best-non-null score difference; only written when
            `version_2_with_negative` is true.
        n_best_size: number of start/end candidates considered per feature
            and maximum number of spans kept per example.
        max_answer_length: spans with more tokens than this are discarded.
        do_lower_case: forwarded to `get_final_text`.
        version_2_with_negative: whether the empty ("no answer") prediction
            is a valid candidate.
        null_score_diff_threshold: with `version_2_with_negative`, the empty
            answer is predicted when the score difference exceeds this value.
    """
    print("Writing predictions to: %s" % (output_prediction_file))
    print("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    # Both record types are loop-invariant, so they are created once here
    # rather than once per example.
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = get_best_indexes(result.start_logits, n_best_size)
            end_indexes = get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of
            # irrelevant
            if version_2_with_negative:
                feature_null_score = (
                    result.start_logits[0] + result.end_logits[0])
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions,
                    # e.g., predict that the start of the span is in the
                    # question. We throw out all invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            if pred.start_index > 0:  # this is a non-null prediction
                feature = features[pred.feature_index]
                tok_tokens = feature.tokens[pred.start_index:(
                    pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(
                    orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case)
                if final_text in seen_predictions:
                    continue
                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        # With negatives enabled the list may hold nothing but the null
        # answer. The threshold test below needs a non-null entry to compare
        # against, so put the same nonce prediction in front instead of
        # failing on a missing `best_non_null_entry`.
        if version_2_with_negative and not any(entry.text for entry in nbest):
            nbest.insert(
                0,
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null >
            # threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text.

    `orig_text` is the span of the original (whitespace-tokenized) document
    that corresponds to the predicted WordPiece span, and `pred_text` is the
    de-tokenized prediction. `orig_text` may carry extra characters, e.g.

        pred_text = steve smith
        orig_text = Steve Smith's

    Returning `orig_text` would keep the unwanted "'s"; returning
    `pred_text` would keep the tokenizer's normalisation. The wanted result
    is "Steve Smith", obtained through a character-to-character alignment
    between the two strings with spaces removed. Whenever that alignment
    heuristic cannot be applied, `orig_text` is returned unchanged.
    """

    def _strip_spaces(text):
        # Returns the text without " " characters plus a map from each
        # position in the stripped text to its position in `text`.
        kept_chars = []
        stripped_to_full = collections.OrderedDict()
        for position, char in enumerate(text):
            if char != " ":
                stripped_to_full[len(kept_chars)] = position
                kept_chars.append(char)
        return ("".join(kept_chars), stripped_to_full)

    # Tokenize `orig_text` and look for the prediction inside the result.
    basic_tokenizer = hub.reader.tokenization.BasicTokenizer(
        do_lower_case=do_lower_case)
    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    # If the space-free versions differ in length the heuristic has failed;
    # otherwise the characters are assumed to be aligned one-to-one.
    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # Invert the tokenized map: position in `tok_text` -> space-free position.
    tok_s_to_ns_map = {
        tok_index: ns_index
        for (ns_index, tok_index) in six.iteritems(tok_ns_to_s_map)
    }

    def _to_orig_position(tok_position):
        # tok_text position -> space-free position -> orig_text position;
        # None when either hop has no entry.
        ns_position = tok_s_to_ns_map.get(tok_position)
        if ns_position is None:
            return None
        return orig_ns_to_s_map.get(ns_position)

    orig_start_position = _to_orig_position(start_position)
    if orig_start_position is None:
        return orig_text

    orig_end_position = _to_orig_position(end_position)
    if orig_end_position is None:
        return orig_text

    return orig_text[orig_start_position:(orig_end_position + 1)]
def get_best_indexes(logits, n_best_size):
    """Return the indexes of the `n_best_size` largest logits, best first.

    Ties keep their original relative order. Fewer indexes are returned
    when `logits` is shorter than `n_best_size`; a non-positive
    `n_best_size` gives an empty list.
    """
    ranked = sorted(
        enumerate(logits), key=lambda pair: pair[1], reverse=True)
    # max(..., 0) keeps a negative size from being read as a slice offset.
    return [index for index, _ in ranked[:max(n_best_size, 0)]]
def compute_softmax(scores):
    """Compute softmax probability over raw logits.

    The maximum score is subtracted before exponentiation. An empty input
    returns an empty list.
    """
    if not scores:
        return []

    peak = max(scores)
    exp_scores = [math.exp(score - peak) for score in scores]

    # Accumulate left to right in a plain loop.
    normalizer = 0.0
    for value in exp_scores:
        normalizer += value

    return [value / normalizer for value in exp_scores]
if __name__ == '__main__':
    # Script entry point: build the reading comprehension task, predict on
    # the dataset's prediction examples, write the prediction files into
    # --result_dir and print SQuAD v1.1 or v2.0 metrics.

    # The result directory is only used after the prediction pass, so check
    # it (and create it) before any model work starts instead of failing
    # late inside os.path.join / open.
    if args.result_dir is None:
        raise ValueError("--result_dir must be set to an output directory")
    if not os.path.isdir(args.result_dir):
        os.makedirs(args.result_dir)

    # Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model
    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Download dataset and use ReadingComprehensionReader to read dataset
    dataset = hub.dataset.SQUAD(
        version_2_with_negative=args.version_2_with_negative)

    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_length=args.max_seq_len,
        doc_stride=128,
        max_query_length=64)

    # Use "sequence_output" for token-level output.
    seq_output = outputs["sequence_output"]

    # Setup feed list for data feeder
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        log_interval=10,
        use_pyreader=args.use_pyreader,
        use_data_parallel=args.use_data_parallel,
        save_ckpt_interval=100,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        enable_memory_optim=True,
        strategy=strategy)

    # Define a reading comprehension finetune task by PaddleHub's API
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config)

    # Data to be predicted
    data = dataset.predict_examples
    features = reader.convert_examples_to_features(
        examples=data, is_training=False)
    run_states = reading_comprehension_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]

    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])

    # Flatten the per-batch outputs into one RawResult per feature. Only the
    # first three entries of each batch result are used here: unique ids,
    # start logits and end logits.
    all_results = []
    for batch_result in results:
        np_unique_ids = batch_result[0]
        np_start_logits = batch_result[1]
        np_end_logits = batch_result[2]
        for idx in range(np_unique_ids.shape[0]):
            unique_id = int(np_unique_ids[idx])
            start_logits = [float(x) for x in np_start_logits[idx].flat]
            end_logits = [float(x) for x in np_end_logits[idx].flat]
            all_results.append(
                RawResult(
                    unique_id=unique_id,
                    start_logits=start_logits,
                    end_logits=end_logits))

    output_prediction_file = os.path.join(args.result_dir, "predictions.json")
    output_nbest_file = os.path.join(args.result_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(args.result_dir, "null_odds.json")

    write_predictions(
        data,
        features,
        all_results,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        max_answer_length=args.max_answer_length,
        n_best_size=args.n_best_size,
        version_2_with_negative=args.version_2_with_negative,
        null_score_diff_threshold=args.null_score_diff_threshold)

    # Re-read the raw dataset file and the predictions just written, in the
    # JSON form the evaluation modules consume.
    with io.open(dataset.predict_file, 'r', encoding="utf8") as dataset_file:
        dataset_json = json.load(dataset_file)
        eval_dataset = dataset_json['data']
    with io.open(
            output_prediction_file, 'r', encoding="utf8") as prediction_file:
        predictions = json.load(prediction_file)

    if not args.version_2_with_negative:
        print(json.dumps(evaluate_v1.evaluate(eval_dataset, predictions)))
    else:
        with io.open(
                output_null_log_odds_file, 'r', encoding="utf8") as odds_file:
            na_probs = json.load(odds_file)
        # Maps qid to true/false
        qid_to_has_ans = evaluate_v2.make_qid_to_has_ans(eval_dataset)
        has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
        no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
        exact_raw, f1_raw = evaluate_v2.get_raw_scores(eval_dataset,
                                                       predictions)
        exact_thresh = evaluate_v2.apply_no_ans_threshold(
            exact_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
        f1_thresh = evaluate_v2.apply_no_ans_threshold(
            f1_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
        out_eval = evaluate_v2.make_eval_dict(exact_thresh, f1_thresh)
        if has_ans_qids:
            has_ans_eval = evaluate_v2.make_eval_dict(
                exact_thresh, f1_thresh, qid_list=has_ans_qids)
            evaluate_v2.merge_eval(out_eval, has_ans_eval, 'HasAns')
        if no_ans_qids:
            no_ans_eval = evaluate_v2.make_eval_dict(
                exact_thresh, f1_thresh, qid_list=no_ans_qids)
            evaluate_v2.merge_eval(out_eval, no_ans_eval, 'NoAns')
        evaluate_v2.find_all_best_thresh(out_eval, predictions, exact_raw,
                                         f1_raw, na_probs, qid_to_has_ans)
        print(json.dumps(out_eval, indent=4))
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on reading comprehension task."""
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# Set PaddleHub's logger to the "INFO" level.
hub.common.logger.logger.setLevel("INFO")

# Command-line options. They are parsed at import time into the module-level
# `args`. Boolean options go through ast.literal_eval, so they must be given
# as the Python literals True / False.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.")
parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Script entry point: wire a BERT module, the SQuAD dataset and a reader
    # into a PaddleHub reading comprehension task, then finetune it using
    # the command-line options held in `args`.

    # Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model
    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Download dataset and use ReadingComprehensionReader to read dataset
    dataset = hub.dataset.SQUAD(
        version_2_with_negative=args.version_2_with_negative)

    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_length=args.max_seq_len,
        doc_stride=128,
        max_query_length=64)

    # The module's "sequence_output" is the feature handed to the task.
    seq_output = outputs["sequence_output"]

    # Setup feed list for data feeder
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        log_interval=10,
        use_pyreader=args.use_pyreader,
        use_data_parallel=args.use_data_parallel,
        save_ckpt_interval=1000,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        enable_memory_optim=True,
        strategy=strategy)

    # Define a reading comprehension finetune task by PaddleHub's API
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config)

    # Finetune by PaddleHub's API
    reading_comprehension_task.finetune()
# Launch reading comprehension finetuning (reading_comprehension.py).
# Exports FLAGS_eager_delete_tensor_gb=0.0 and makes GPUs 0 and 1 visible;
# --use_data_parallel=True is passed below.
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0,1

# Checkpoints go to ./ckpt_rc. --version_2_with_negative=False is passed,
# which the script's help text describes as the setting for data without
# unanswerable questions (i.e. not SQuAD v2.0).
python -u reading_comprehension.py \
--batch_size=12 \
--use_gpu=True \
--checkpoint_dir="./ckpt_rc" \
--learning_rate=3e-5 \
--weight_decay=0.01 \
--warmup_proportion=0.1 \
--num_epoch=2 \
--max_seq_len=384 \
--use_pyreader=True \
--use_data_parallel=True \
--version_2_with_negative=False
# Run prediction and evaluation (predict.py) from the finetuned checkpoint.
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

CKPT_DIR="./ckpt_rc"
RES_DIR="./result"

# -p: do not fail when the directory is left over from a previous run.
mkdir -p "${RES_DIR}"

python -u predict.py \
    --batch_size=12 \
    --use_gpu=True \
    --checkpoint_dir="${CKPT_DIR}" \
    --learning_rate=3e-5 \
    --weight_decay=0.01 \
    --warmup_proportion=0.1 \
    --num_epoch=1 \
    --max_seq_len=384 \
    --use_pyreader=False \
    --use_data_parallel=False \
    --version_2_with_negative=False \
    --result_dir="${RES_DIR}"
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification task """
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options of the regression prediction demo.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
# The help text used to be a copy of the --checkpoint_dir description.
parser.add_argument("--dataset", type=str, default="STS-B", help="The choice of dataset")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Resolve the dataset and the pretrained module. STS-B is the only
    # dataset this regression demo accepts.
    if args.dataset.lower() == "sts-b":
        dataset = hub.dataset.GLUE("STS-B")
        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)
    reader = hub.reader.RegressionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    # Construct transfer learning network.
    # "pooled_output" is the sentence-level feature; "sequence_output" would
    # be the token-level one.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder: every input tensor of the module.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    # Define a regression finetune task by PaddleHub's API
    reg_task = hub.RegressionTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        config=config)

    # Data to be predicted: sentence pairs of the dataset's prediction split.
    data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()]
    run_states = reg_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]

    if not os.path.exists("output"):
        os.makedirs("output")
    output_path = os.path.join("output", "%s.tsv" % args.dataset.upper())

    # `with` guarantees the TSV is flushed and closed even if formatting a
    # prediction raises half-way through.
    index = 0
    with open(output_path, 'w') as fout:
        fout.write("index\tprediction")
        for batch_result in results:
            for result in batch_result[0]:
                # Echo the first three predictions as a quick sanity check.
                if index < 3:
                    print("%s\t%s\tpredict=%.3f" % (data[index][0],
                                                    data[index][1], result[0]))
                fout.write("\n%s\t%.3f" % (index, result[0]))
                index += 1
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification task """
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
# Command-line options of the regression fine-tuning demo.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
# The help text used to be a copy of the --checkpoint_dir description.
parser.add_argument("--dataset", type=str, default="STS-B", help="The choice of dataset")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
    # Resolve the dataset and the pretrained module. STS-B is the only
    # dataset this regression demo accepts.
    if args.dataset.lower() == "sts-b":
        dataset = hub.dataset.GLUE("STS-B")
        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)
    reader = hub.reader.RegressionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    # Construct transfer learning network.
    # "pooled_output" is the sentence-level feature; "sequence_output" would
    # be the token-level one.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder: every input tensor of the module.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune.
    # --warmup_proportion was parsed but silently ignored before; it is now
    # forwarded so the command-line flag actually takes effect.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=args.use_data_parallel,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a regression finetune task by PaddleHub's API
    reg_task = hub.RegressionTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        config=config)

    # Finetune and evaluate by PaddleHub's API
    # will finish training, evaluation, testing, save model automatically
    reg_task.finetune_and_eval()
# Run predict.py of the regression demo, loading the checkpoint in CKPT_DIR.
export FLAGS_eager_delete_tensor_gb=0.0
# export CUDA_VISIBLE_DEVICES=0

# STS-B is the dataset this launcher is configured for.
DATASET="STS-B"
CKPT_DIR="./ckpt_${DATASET}"

# STS-B: batch_size=32, max_seq_len=128
# The last argument carries no trailing backslash, so the command ends here.
python -u predict.py --checkpoint_dir "${CKPT_DIR}" \
    --max_seq_len 128 \
    --use_gpu True \
    --dataset="${DATASET}" \
    --batch_size=32
# Launch regression.py to fine-tune on DATASET, writing checkpoints to CKPT_DIR.
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

DATASET="STS-B"
CKPT_DIR="./ckpt_${DATASET}"

# Recommended hyperparameters for each task
# STS-B: batch_size=32, weight_decay=0.1, num_epoch=3, max_seq_len=128, lr=4e-5
# The last argument carries no trailing backslash, so the command ends here.
python -u regression.py \
    --batch_size=32 \
    --use_gpu=True \
    --dataset="${DATASET}" \
    --checkpoint_dir="${CKPT_DIR}" \
    --learning_rate=4e-5 \
    --weight_decay=0.1 \
    --max_seq_len=128 \
    --num_epoch=3 \
    --use_pyreader=True \
    --use_data_parallel=True
......@@ -29,7 +29,7 @@ import paddlehub as hub
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint_dir", type=str, default="ckpt_20190802182531", help="Directory to model checkpoint")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
......@@ -42,64 +42,89 @@ args = parser.parse_args()
if __name__ == '__main__':
dataset = None
metrics_choices = []
# Download dataset and use ClassifyReader to read dataset
if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "nlpcc_dbqa":
dataset = hub.dataset.NLPCC_DBQA()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "lcqmc":
dataset = hub.dataset.LCQMC()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mrpc":
dataset = hub.dataset.GLUE("MRPC")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["f1", "acc"]
# The first metric will be chosen for evaluation. Ref: task.py:799
elif args.dataset.lower() == "qqp":
dataset = hub.dataset.GLUE("QQP")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["f1", "acc"]
elif args.dataset.lower() == "sst-2":
dataset = hub.dataset.GLUE("SST-2")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "cola":
dataset = hub.dataset.GLUE("CoLA")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["matthews", "acc"]
elif args.dataset.lower() == "qnli":
dataset = hub.dataset.GLUE("QNLI")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "rte":
dataset = hub.dataset.GLUE("RTE")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset.lower() == "mnli":
dataset = hub.dataset.GLUE("MNLI")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli_m":
dataset = hub.dataset.GLUE("MNLI_m")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mnli_mm":
dataset = hub.dataset.GLUE("MNLI_mm")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower().startswith("xnli"):
dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:])
module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
metrics_choices = ["acc"]
else:
raise ValueError("%s dataset is not defined" % args.dataset)
support_metrics = ["acc", "f1", "matthews"]
for metric in metrics_choices:
if metric not in support_metrics:
raise ValueError("\"%s\" metric is not defined" % metric)
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
reader = hub.reader.ClassifyReader(
......@@ -147,7 +172,8 @@ if __name__ == '__main__':
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
config=config,
metrics_choices=metrics_choices)
# Data to be predicted
data = [[d.text_a, d.text_b] for d in dataset.get_dev_examples()[:3]]
......
......@@ -2,18 +2,32 @@ export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
DATASET="chnsenticorp"
CKPT_DIR="./ckpt_${DATASET}"
# Support ChnSentiCorp NLPCC_DBQA LCQMC MRPC QQP SST-2
# CoLA QNLI RTE MNLI XNLI
# for XNLI: Specify the language with an underscore like xnli_zh.
# ar: Arabic bg: Bulgarian de: German
# el: Greek en: English es: Spanish
# fr: French hi: Hindi ru: Russian
# sw: Swahili th: Thai tr: Turkish
# ur: Urdu vi: Vietnamese zh: Chinese (Simplified)
# Recommended hyperparameters for different tasks
# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5
# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5
# QQP: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# CoLA: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# MRPC: batch_size=32, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
# RTE: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=3e-5
# MNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# Specify the matched/mismatched dev and test dataset with an underscore.
# mnli_m or mnli: dev and test in matched dataset.
# mnli_mm: dev and test in mismatched dataset.
# The difference can be seen in https://www.nyu.edu/projects/bowman/multinli/paper.pdf.
# If you are not sure which one to pick, just use mnli or mnli_m.
# XNLI: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
# Specify the language with an underscore like xnli_zh.
# ar- Arabic bg- Bulgarian de- German
# el- Greek en- English es- Spanish
# fr- French hi- Hindi ru- Russian
# sw- Swahili th- Thai tr- Turkish
# ur- Urdu vi- Vietnamese zh- Chinese (Simplified)
python -u text_classifier.py \
--batch_size=24 \
......
......@@ -2,17 +2,20 @@ export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
DATASET="chnsenticorp"
CKPT_DIR="./ckpt_${DATASET}"
# Support ChnSentiCorp NLPCC_DBQA LCQMC MRPC QQP SST-2
# CoLA QNLI RTE MNLI XNLI
# CoLA QNLI RTE MNLI (or MNLI_m) MNLI_mm XNLI
# for XNLI: Specify the language with an underscore like xnli_zh.
# ar: Arabic bg: Bulgarian de: German
# el: Greek en: English es: Spanish
# fr: French hi: Hindi ru: Russian
# sw: Swahili th: Thai tr: Turkish
# ur: Urdu vi: Vietnamese zh: Chinese (Simplified)
DATASET="ChnSentiCorp"
CKPT_DIR="./ckpt_${DATASET}"
python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False --dataset=${DATASET} --use_taskid False
python -u predict.py --checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \
--use_gpu=True \
--dataset=${DATASET} \
--batch_size=150 \
--use_taskid=False \
......@@ -26,7 +26,7 @@ parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whet
parser.add_argument("--dataset", type=str, default="chnsenticorp", help="The choice of dataset")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy")
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
......@@ -39,64 +39,89 @@ args = parser.parse_args()
if __name__ == '__main__':
dataset = None
metrics_choices = []
# Download dataset and use ClassifyReader to read dataset
if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "nlpcc_dbqa":
dataset = hub.dataset.NLPCC_DBQA()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "lcqmc":
dataset = hub.dataset.LCQMC()
module = hub.Module(name="ernie")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mrpc":
dataset = hub.dataset.GLUE("MRPC")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["f1", "acc"]
# The first metric will be chosen for evaluation. Ref: task.py:799
elif args.dataset.lower() == "qqp":
dataset = hub.dataset.GLUE("QQP")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["f1", "acc"]
elif args.dataset.lower() == "sst-2":
dataset = hub.dataset.GLUE("SST-2")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "cola":
dataset = hub.dataset.GLUE("CoLA")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["matthews", "acc"]
elif args.dataset.lower() == "qnli":
dataset = hub.dataset.GLUE("QNLI")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "rte":
dataset = hub.dataset.GLUE("RTE")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset.lower() == "mnli":
dataset = hub.dataset.GLUE("MNLI")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli":
dataset = hub.dataset.GLUE("MNLI_m")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower() == "mnli_mm":
dataset = hub.dataset.GLUE("MNLI_mm")
if args.use_taskid:
module = hub.Module(name="ernie_v2_eng_base")
else:
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
metrics_choices = ["acc"]
elif args.dataset.lower().startswith("xnli"):
dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:])
module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
metrics_choices = ["acc"]
else:
raise ValueError("%s dataset is not defined" % args.dataset)
support_metrics = ["acc", "f1", "matthews"]
for metric in metrics_choices:
if metric not in support_metrics:
raise ValueError("\"%s\" metric is not defined" % metric)
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
reader = hub.reader.ClassifyReader(
......@@ -144,7 +169,8 @@ if __name__ == '__main__':
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
config=config,
metrics_choices=metrics_choices)
# Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
......
......@@ -50,7 +50,12 @@ from .finetune.task import TextClassifierTask
from .finetune.task import ImageClassifierTask
from .finetune.task import SequenceLabelTask
from .finetune.task import MultiLabelClassifierTask
from .finetune.task import RegressionTask
from .finetune.task import ReadingComprehensionTask
from .finetune.config import RunConfig
from .finetune.strategy import AdamWeightDecayStrategy
from .finetune.strategy import DefaultStrategy
from .finetune.strategy import DefaultFinetuneStrategy
from .finetune.strategy import L2SPFinetuneStrategy
from .finetune.strategy import ULMFiTStrategy
from .finetune.strategy import CombinedStrategy
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from multiprocessing.pool import ThreadPool
import copy
import json
import math
import numpy as np
import six
import time
from paddlehub.common.logger import logger
from paddlehub.common.utils import mkdir
# Positive infinity, used as the initial "worst" reward. math.inf is only
# looked up on Python 3; Python 2 falls back to float("inf").
INF = math.inf if six.PY3 else float("inf")
class PSHE2(object):
    """Population-based hyperparameter searcher with momentum updates.

    The searcher keeps ``popsize`` candidate vectors (one value per
    hyperparameter, in the evaluator's normalised space bounded by -1.0 and
    1.0).  ``step`` evaluates every candidate through ``evaluator.run`` using
    one worker thread per CUDA device id, then ``feedback`` moves the
    candidates using a momentum term built from the distance to each
    candidate's own best position and to the best position of the whole
    population.  A lower reward is treated as better.

    Args:
        evaluator: object providing ``get_init_params()`` (initial candidate
            vector) and ``run([params, cuda, ckpt_dir, log_file])`` (reward).
        cudas: list of CUDA device ids; its length is the number of
            candidates evaluated concurrently.  Defaults to ``["0"]``.
        popsize: number of candidates in the population.
        output_dir: directory name reported by ``output_dir``; a
            timestamped ``output_<YYYYmmddHHMMSS>`` name is used when None.
        alpha: step size applied to gradients and momentum.
        epsilon: scale of the random perturbation applied after each update.
    """

    def __init__(
            self,
            evaluator,
            cudas=None,
            popsize=5,
            output_dir=None,
            alpha=0.5,
            epsilon=0.2,
    ):
        # ``None`` replaces the former mutable default ``["0"]``.  The list is
        # copied so that the bookkeeping in ``step`` never mutates the
        # caller's list (or a default shared between instances).
        cudas = ["0"] if cudas is None else list(cudas)
        self._num_thread = len(cudas)
        self._popsize = popsize
        self._alpha = alpha
        self._epsilon = epsilon
        self._iteration = 0
        self.cudas = cudas
        self.is_cuda_free = {"free": list(cudas), "busy": []}

        self.evaluator = evaluator
        self.init_input = evaluator.get_init_params()
        self.num_hparm = len(self.init_input)

        self.best_hparams_per_pop = self._zero_matrix()
        self.best_reward_per_pop = [INF] * self._popsize
        self.momentums = self._zero_matrix()
        self.best_hparms_all_pop = []
        self.best_reward_all_pop = INF
        self.current_hparams = [
            self.randomSolution() for _ in range(self.popsize)
        ]

        if output_dir is None:
            now = int(time.time())
            time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now))
            self._output_dir = "output_" + time_str
        else:
            self._output_dir = output_dir

    def _zero_matrix(self):
        """Return a popsize x num_hparm matrix of zeros with independent rows.

        ``[[0] * n] * m`` would repeat one row object ``m`` times, so writing
        to one candidate's row would overwrite every candidate.
        """
        return [[0] * self.num_hparm for _ in range(self._popsize)]

    @property
    def thread(self):
        """Number of candidates evaluated concurrently (one per CUDA id)."""
        return self._num_thread

    @property
    def popsize(self):
        """Number of candidates in the population."""
        return self._popsize

    @property
    def alpha(self):
        """Step size used for gradients and momentum."""
        return self._alpha

    @property
    def epsilon(self):
        """Scale of the random perturbation applied by ``smallPeturb``."""
        return self._epsilon

    @property
    def output_dir(self):
        """Output directory chosen at construction time."""
        return self._output_dir

    @property
    def iteration(self):
        """Number of completed ``feedback`` calls."""
        return self._iteration

    def set_output_dir(self, output_dir=None):
        """Return ``output_dir``, or the configured directory when it is None.

        Despite its name this method stores nothing; callers use the
        returned value.
        """
        if output_dir is not None:
            return output_dir
        return self._output_dir

    def randomSolution(self):
        """Return a random candidate drawn around ``init_input``.

        For each hyperparameter a ratio in [-1, 1) is drawn; a non-negative
        ratio moves the initial value towards 1.0 by that fraction, a
        negative one moves it towards -1.0.
        """
        solut = [0] * self.num_hparm
        for i in range(self.num_hparm):
            ratio = (np.random.random_sample() - 0.5) * 2.0
            if ratio >= 0:
                solut[i] = (
                    1.0 - self.init_input[i]) * ratio + self.init_input[i]
            else:
                solut[i] = (
                    self.init_input[i] + 1.0) * ratio + self.init_input[i]
        return solut

    def smallPeturb(self):
        """Randomly nudge every candidate in place, scaled by ``epsilon``."""
        for i in range(self.popsize):
            for j in range(self.num_hparm):
                ratio = (np.random.random_sample() - 0.5) * 2.0
                if ratio >= 0:
                    self.current_hparams[i][j] = (
                        1.0 - self.current_hparams[i][j]
                    ) * ratio * self.epsilon + self.current_hparams[i][j]
                else:
                    self.current_hparams[i][j] = (
                        self.current_hparams[i][j] +
                        1.0) * ratio * self.epsilon + self.current_hparams[i][j]

    def estimatePopGradients(self):
        """Return, per candidate, its offset from the population-wide best."""
        return [[
            self.current_hparams[i][j] - self.best_hparms_all_pop[j]
            for j in range(self.num_hparm)
        ] for i in range(self.popsize)]

    def estimateLocalGradients(self):
        """Return, per candidate, its offset from that candidate's own best."""
        return [[
            self.current_hparams[i][j] - self.best_hparams_per_pop[i][j]
            for j in range(self.num_hparm)
        ] for i in range(self.popsize)]

    def estimateMomemtum(self):
        """Update every candidate's momentum from both gradient estimates.

        The previous momentum is damped by ``1 - 3 * alpha / iteration``;
        ``feedback`` increments the iteration counter before calling this, so
        the divisor is at least 1.
        """
        popGrads = self.estimatePopGradients()
        localGrads = self.estimateLocalGradients()
        for i in range(self.popsize):
            for j in range(self.num_hparm):
                self.momentums[i][j] = (
                    1 - 3.0 * self.alpha / self.iteration
                ) * self.momentums[i][j] - self.alpha * localGrads[i][
                    j] - self.alpha * popGrads[i][j]

    def is_stop(self):
        """Stopping criterion; this searcher never asks to stop."""
        return False

    def solutions(self):
        """Return the current candidates (the live list, not a copy)."""
        return self.current_hparams

    def feedback(self, params_list, reward_list):
        """Record the rewards of the current candidates and move them.

        Args:
            params_list: unused; the rewards are matched by index to the
                current candidates.
            reward_list: one reward per candidate; lower is better.
        """
        self._iteration = self._iteration + 1
        for i in range(self.popsize):
            if reward_list[i] < self.best_reward_per_pop[i]:
                self.best_hparams_per_pop[i] = copy.deepcopy(
                    self.current_hparams[i])
                self.best_reward_per_pop[i] = reward_list[i]
            if reward_list[i] < self.best_reward_all_pop:
                # Snapshot the row: the candidate is modified in place below,
                # which would otherwise silently move the recorded optimum.
                self.best_hparms_all_pop = copy.deepcopy(
                    self.current_hparams[i])
                self.best_reward_all_pop = reward_list[i]
        self.estimateMomemtum()
        for i in range(self.popsize):
            for j in range(len(self.init_input)):
                self.current_hparams[i][j] = self.current_hparams[i][
                    j] + self.alpha * self.momentums[i][j]
        self.smallPeturb()

    def optimal_solution(self):
        """Return the best candidate seen so far (empty before any feedback)."""
        return self.best_hparms_all_pop

    def step(self, output_dir):
        """Evaluate every candidate once and update the population.

        Candidates are evaluated in batches of at most ``thread`` jobs, each
        job bound to a distinct free CUDA id.  Job ``idx`` writes to
        ``<output_dir>/ckpt-<idx>`` and ``<output_dir>/log-<idx>.info``.

        Returns:
            dict mapping each evaluated candidate (as a tuple) to its
            checkpoint directory.
        """
        solutions = self.solutions()
        params_cudas_dirs = []
        solution_results = []
        solutions_ckptdirs = {}
        mkdir(output_dir)
        last_idx = len(solutions) - 1
        for idx, solution in enumerate(solutions):
            cuda = self.is_cuda_free["free"][0]
            ckptdir = output_dir + "/ckpt-" + str(idx)
            log_file = output_dir + "/log-" + str(idx) + ".info"
            params_cudas_dirs.append([solution, cuda, ckptdir, log_file])
            solutions_ckptdirs[tuple(solution)] = ckptdir
            self.is_cuda_free["free"].remove(cuda)
            self.is_cuda_free["busy"].append(cuda)
            # Flush when the batch is full or when this is the last
            # candidate, so a trailing partial batch of any size is always
            # evaluated and every candidate gets a reward.
            if len(params_cudas_dirs) == self.thread or idx == last_idx:
                tp = ThreadPool(len(params_cudas_dirs))
                try:
                    solution_results += tp.map(self.evaluator.run,
                                               params_cudas_dirs)
                finally:
                    tp.close()
                    tp.join()
                for param_cuda in params_cudas_dirs:
                    self.is_cuda_free["free"].append(param_cuda[1])
                    self.is_cuda_free["busy"].remove(param_cuda[1])
                params_cudas_dirs = []
        self.feedback(solutions, solution_results)
        return solutions_ckptdirs
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import io
import hashlib
import math
import os
import random
import six
import yaml
from paddlehub.common.logger import logger
from paddlehub.common.utils import is_windows
# Offset from which an evaluation result is subtracted to obtain a reward;
# it is also the reward returned for parameter sets rejected as invalid.
REWARD_SUM = 10000

# Positive infinity. math.inf is only looked up on Python 3; Python 2 falls
# back to float("inf").
INF = math.inf if six.PY3 else float("inf")
class BaseEvaluator(object):
    """Shared parameter handling for hyperparameter evaluators.

    The YAML file given at construction must contain a ``param_list`` whose
    entries provide ``name``, ``type``, ``init_value``, ``greater_than``
    (lower bound) and ``lower_than`` (upper bound).  Searchers work on
    values normalised to [-1, 1]; this class converts between that space
    and the real parameter ranges.  Subclasses implement ``run``.
    """

    def __init__(self, params_file, finetunee_script):
        self.finetunee_script = finetunee_script
        with io.open(params_file, 'r', encoding='utf8') as f:
            self.params = yaml.safe_load(f)

    def get_init_params(self):
        """Return the configured initial values, normalised to [-1, 1]."""
        raw_values = [spec['init_value'] for spec in self.params["param_list"]]
        return self.inverse_convert_params(raw_values)

    def get_reward(self, result_output):
        """Turn an evaluation result into a reward (REWARD_SUM - result)."""
        return REWARD_SUM - float(result_output)

    def is_valid_params(self, params):
        """Return True when every real-valued parameter is within its bounds."""
        for idx, spec in enumerate(self.params["param_list"]):
            value = params[idx]
            if value < float(spec["greater_than"]) or value > float(
                    spec["lower_than"]):
                return False
        return True

    def convert_params(self, params):
        """Map normalised values in [-1, 1] onto the real parameter ranges.

        Results are clamped to the bounds and truncated to ``int`` for
        parameters declared with type ``"int"``.
        """
        converted = []
        for idx, spec in enumerate(self.params["param_list"]):
            low = spec["greater_than"]
            high = spec["lower_than"]
            value = float(low + (params[idx] + 1.0) / 2.0 * (high - low))
            if value <= float(low):
                value = float(low)
            if value >= float(high):
                value = float(high)
            if spec["type"] == "int":
                value = int(value)
            converted.append(value)
        return converted

    def inverse_convert_params(self, params):
        """Map real parameter values onto [-1, 1], clamping at both ends."""
        normalised = []
        for idx, spec in enumerate(self.params["param_list"]):
            low = spec["greater_than"]
            high = spec["lower_than"]
            value = float(-1.0 + 2.0 * (params[idx] - low) / (high - low))
            if value <= -1.0:
                value = -1.0
            if value >= 1.0:
                value = 1.0
            normalised.append(value)
        return normalised

    def format_params_str(self, params):
        """Render the parameters as ``--name=value`` command-line options."""
        specs = self.params["param_list"]
        options = ["--%s=%s" % (specs[0]["name"], params[0])]
        for idx in range(1, len(specs)):
            options.append("--%s=%s" % (specs[idx]["name"], str(params[idx])))
        return " ".join(options)

    def run(self, *args):
        """Evaluate one parameter set; must be provided by subclasses."""
        raise NotImplementedError

    def new_round(self):
        """Hook called between search rounds; nothing to do by default."""
        pass
class FullTrailEvaluator(BaseEvaluator):
    """Evaluator that runs the fine-tune script from scratch for every trial."""

    def __init__(self, params_file, finetunee_script):
        super(FullTrailEvaluator, self).__init__(params_file, finetunee_script)
        # run() records every trial's reward here, keyed by checkpoint dir.
        # The attribute was previously never created, so the first call to
        # run() failed with AttributeError.
        self.model_rewards = {}

    def run(self, *args):
        """Run one trial in a subprocess and return its reward.

        Args:
            *args: a single job ``[params, cuda, ckpt_dir, log_file]`` where
                ``params`` are normalised hyperparameters, ``cuda`` is the
                value exported as CUDA_VISIBLE_DEVICES, ``ckpt_dir`` is
                passed as ``--checkpoint_dir`` and ``log_file`` receives the
                script's stdout and stderr.

        Returns:
            ``REWARD_SUM`` for an invalid parameter set, otherwise the
            reward derived from the last line of the log file.  A trial
            whose log is missing, empty or does not end with a number is
            scored as an evaluation result of 0.0.
        """
        params = args[0][0]
        num_cuda = args[0][1]
        ckpt_dir = args[0][2]
        log_file = args[0][3]
        params = self.convert_params(params)
        if not self.is_valid_params(params):
            return REWARD_SUM
        param_str = self.format_params_str(params)
        # Create/truncate the log so a stale log of an earlier trial is
        # never mistaken for this trial's output.
        with open(log_file, "w"):
            pass
        if is_windows():
            run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
                (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
        else:
            run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
                (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
        try:
            os.system(run_cmd)
            with open(log_file, "r") as f:
                lines = f.readlines()
            # The script is expected to print its evaluation result as the
            # last line.  Parsing here (rather than later in get_reward)
            # makes a crashed script, whose last line is typically part of
            # a traceback, count as a failed trial instead of raising.
            eval_result = float(lines[-1])
        except (IOError, OSError, IndexError, ValueError):
            print(
                "WARNING: Program which was ran with hyperparameters as %s was crashed!"
                % param_str.replace("--", ""))
            eval_result = 0.0
        reward = self.get_reward(eval_result)
        self.model_rewards[ckpt_dir] = reward
        return reward
class ModelBasedEvaluator(BaseEvaluator):
    """Evaluator that warm-starts trials from the best models of the last round.

    After ``new_round`` has selected the better half of the previous
    round's checkpoints, each trial is started with ``--epochs=1`` and
    ``--model_path <ckpt>/best_model``, cycling through the selected
    checkpoints.  Before that, trials run exactly like a full trial.
    """

    def __init__(self, params_file, finetunee_script):
        super(ModelBasedEvaluator, self).__init__(params_file, finetunee_script)
        # Rewards of the current round, keyed by checkpoint directory.
        self.model_rewards = {}
        # Checkpoint directories selected by new_round(), best first.
        self.half_best_model_ckpt = []
        # Number of trials started so far; used to cycle through the
        # selected checkpoints.
        self.run_count = 0

    def run(self, *args):
        """Run one trial in a subprocess and return its reward.

        Args:
            *args: a single job ``[params, cuda, ckpt_dir, log_file]`` where
                ``params`` are normalised hyperparameters, ``cuda`` is the
                value exported as CUDA_VISIBLE_DEVICES, ``ckpt_dir`` is
                passed as ``--checkpoint_dir`` and ``log_file`` receives the
                script's stdout and stderr.

        Returns:
            ``REWARD_SUM`` for an invalid parameter set, otherwise the
            reward derived from the last line of the log file.  A trial
            whose log is missing, empty or does not end with a number is
            scored as an evaluation result of 0.0.
        """
        params = args[0][0]
        num_cuda = args[0][1]
        ckpt_dir = args[0][2]
        log_file = args[0][3]
        params = self.convert_params(params)
        if not self.is_valid_params(params):
            return REWARD_SUM
        param_str = self.format_params_str(params)
        # Create/truncate the log so a stale log of an earlier trial is
        # never mistaken for this trial's output.
        with open(log_file, "w"):
            pass
        if len(self.half_best_model_ckpt) > 0:
            model_path = self.half_best_model_ckpt[self.run_count % len(
                self.half_best_model_ckpt)] + "/best_model"
            if is_windows():
                run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --epochs=1 --model_path %s --checkpoint_dir=%s %s >%s 2>&1" % \
                    (num_cuda, self.finetunee_script, model_path, ckpt_dir, param_str, log_file)
            else:
                run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --epochs=1 --model_path %s --checkpoint_dir=%s %s >%s 2>&1" % \
                    (num_cuda, self.finetunee_script, model_path, ckpt_dir, param_str, log_file)
        else:
            if is_windows():
                run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
                    (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
            else:
                run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
                    (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
        self.run_count += 1
        try:
            os.system(run_cmd)
            with open(log_file, "r") as f:
                lines = f.readlines()
            # The script is expected to print its evaluation result as the
            # last line.  Parsing here (rather than later in get_reward)
            # makes a crashed script, whose last line is typically part of
            # a traceback, count as a failed trial instead of raising.
            eval_result = float(lines[-1])
        except (IOError, OSError, IndexError, ValueError):
            print(
                "WARNING: Program which was ran with hyperparameters as %s was crashed!"
                % param_str.replace("--", ""))
            eval_result = 0.0
        reward = self.get_reward(eval_result)
        self.model_rewards[ckpt_dir] = reward
        return reward

    def new_round(self):
        """Select the better half of this round's checkpoints and reset rewards.

        Rewards are sorted in ascending order and the first half (at least
        one entry when any reward exists) is kept for warm-starting the
        next round.  The selection stays in rank order; it was previously
        passed through a set, which left the order arbitrary.
        """
        half_size = max(1, len(self.model_rewards) // 2)
        self.half_best_model_ckpt = sorted(
            self.model_rewards, key=self.model_rewards.get)[:half_size]
        self.model_rewards = {}
......@@ -25,3 +25,4 @@ from . import help
from . import clear
from . import config
from . import hub
from . import autofinetune
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import io
import json
import os
import sys
import ast
import six
import pandas
import numpy as np
from paddlehub.commands.base_command import BaseCommand, ENTRY
from paddlehub.common.arg_helper import add_argument, print_arguments
from paddlehub.autofinetune.autoft import PSHE2
from paddlehub.autofinetune.evaluator import FullTrailEvaluator
from paddlehub.autofinetune.evaluator import ModelBasedEvaluator
from paddlehub.common.logger import logger
import paddlehub as hub
# Command that searches hyperparameters for a user-supplied fine-tune script.
# NOTE: this class deliberately has no docstring. __init__ hands
# self.__class__.__doc__ to argparse as the parser description, so adding
# one would change the generated help output.
class AutoFineTuneCommand(BaseCommand):
    name = "autofinetune"

    def __init__(self, name):
        super(AutoFineTuneCommand, self).__init__(name)
        self.show_in_help = True
        self.name = name
        self.description = "Paddlehub helps to finetune a task by searching hyperparameters automatically."
        prog = '%s %s <task to be fintuned in python script>' % (ENTRY,
                                                                 self.name)
        self.parser = argparse.ArgumentParser(
            description=self.__class__.__doc__,
            prog=prog,
            usage='%(prog)s',
            add_help=False)
        self.module = None

    def add_params_file_arg(self):
        """Register the mandatory ``--param_file`` option."""
        help_text = "Hyperparameters to be searched in the yaml format. The number of hyperparameters searched must be greater than 1."
        self.arg_params_to_be_searched_group.add_argument(
            "--param_file",
            type=str,
            default=None,
            required=True,
            help=help_text)

    def add_autoft_config_arg(self):
        """Register the optional search-configuration options."""
        register = self.arg_config_group.add_argument
        register("--popsize", type=int, default=5, help="Population size")
        register(
            "--cuda",
            type=ast.literal_eval,
            default=['0'],
            help="The list of gpu devices to be used")
        register("--round", type=int, default=10, help="Number of searches")
        register(
            "--output_dir",
            type=str,
            default=None,
            help="Directory to model checkpoint")
        register(
            "--evaluate_choice",
            type=str,
            default="fulltrail",
            help="Choices: fulltrail or modelbased.")

    def execute(self, argv):
        """Parse ``argv``, run the search rounds and write ``./log_file.txt``.

        ``argv[0]`` is the script to fine-tune and the remaining items are
        options. Returns False (after printing help) when the script or the
        options are missing, True once the search has finished. Raises
        ValueError for an unknown ``--evaluate_choice``.
        """
        if not argv:
            print("ERROR: Please specify a script to be finetuned in python.\n")
            self.help()
            return False

        self.fintunee_script = argv[0]
        self.parser.prog = '%s %s %s' % (ENTRY, self.name, self.fintunee_script)
        self.arg_params_to_be_searched_group = self.parser.add_argument_group(
            title="Input options",
            description="Hyperparameters to be searched.")
        self.arg_config_group = self.parser.add_argument_group(
            title="Autofinetune config options",
            description=
            "Autofintune configuration for controlling autofinetune behavior, not required"
        )
        self.add_params_file_arg()
        self.add_autoft_config_arg()

        option_argv = argv[1:]
        if not option_argv:
            self.help()
            return False
        self.args = self.parser.parse_args(option_argv)

        # The evaluator choice is matched case-insensitively.
        evaluator_classes = {
            "fulltrail": FullTrailEvaluator,
            "modelbased": ModelBasedEvaluator,
        }
        choice = self.args.evaluate_choice.lower()
        if choice not in evaluator_classes:
            raise ValueError(
                "The evaluate %s is not defined!" % self.args.evaluate_choice)
        evaluator = evaluator_classes[choice](self.args.param_file,
                                              self.fintunee_script)

        autoft = PSHE2(
            evaluator,
            cudas=self.args.cuda,
            popsize=self.args.popsize,
            output_dir=self.args.output_dir)

        run_round_cnt = 0
        solutions_ckptdirs = {}
        print("PaddleHub Autofinetune starts.")
        while not autoft.is_stop() and run_round_cnt < self.args.round:
            print("PaddleHub Autofinetune starts round at %s." % run_round_cnt)
            round_dir = autoft._output_dir + "/round" + str(run_round_cnt)
            solutions_ckptdirs.update(autoft.step(round_dir))
            evaluator.new_round()
            run_round_cnt += 1
        print("PaddleHub Autofinetune ends.")

        saved_notice = "The checkpont directory of programs ran with paramemters searched are saved as log_file.txt ."
        with open("./log_file.txt", "w") as f:
            best_choice = evaluator.convert_params(autoft.optimal_solution())
            print("The best hyperparameters:")
            f.write("The best hyperparameters:\n")

            # One line per searched parameter, on the console and in the file.
            param_name = []
            for idx, param in enumerate(evaluator.params["param_list"]):
                param_name.append(param["name"])
                f.write(param["name"] + "\t:\t" + str(best_choice[idx]) + "\n")
                print("%s : %s" % (param["name"], best_choice[idx]))

            # Table of every evaluated solution and its checkpoint directory.
            f.write("\n\n\n")
            f.write("\t".join(param_name) + "\toutput_dir\n\n")
            logger.info(saved_notice)
            print(saved_notice)
            for solution, ckptdir in solutions_ckptdirs.items():
                values = [str(p) for p in evaluator.convert_params(solution)]
                f.write("\t".join(values) + "\t" + ckptdir + "\n\n")
        return True
# Module-level command object, obtained through the inherited instance() factory.
command = AutoFineTuneCommand.instance()
......@@ -36,7 +36,6 @@ class InstallCommand(BaseCommand):
prog='%s %s <module_name>' % (ENTRY, name),
usage='%(prog)s',
add_help=False)
#TODO(wuzewu): add --upgrade option
def execute(self, argv):
if not argv:
......
......@@ -15,7 +15,6 @@
import os
# TODO: Change dir.py's filename, this naming rule is not qualified
USER_HOME = os.path.expanduser('~')
HUB_HOME = os.path.join(USER_HOME, ".paddlehub")
MODULE_HOME = os.path.join(HUB_HOME, "modules")
......
......@@ -77,7 +77,6 @@ class Downloader(object):
with open(file_name, 'wb') as f:
shutil.copyfileobj(r.raw, f)
else:
#TODO(ZeyuChen) upgrade to tqdm process
with open(file_name, 'wb') as f:
dl = 0
total_length = int(total_length)
......
......@@ -24,6 +24,7 @@ import requests
import json
import yaml
import random
import fcntl
from random import randint
from paddlehub.common import utils, srv_utils
......@@ -38,6 +39,9 @@ CACHE_TIME = 60 * 10
class HubServer(object):
def __init__(self, config_file_path=None):
LOCK_FILE = os.path.join(hub.HUB_HOME, '__LOCK__')
LOCK_FP = open(LOCK_FILE, 'a+')
fcntl.flock(LOCK_FP.fileno(), fcntl.LOCK_EX)
if not config_file_path:
config_file_path = os.path.join(hub.CONF_HOME, 'config.json')
if not os.path.exists(hub.CONF_HOME):
......@@ -53,6 +57,7 @@ class HubServer(object):
self.server_url = self.config['server_url']
self.request()
self._load_resource_list_file_if_valid()
LOCK_FP.close()
def get_server_url(self):
random.seed(int(time.time()))
......@@ -178,7 +183,6 @@ class HubServer(object):
self.resource_list_file['version'][index]
for index in resource_index_list
]
#TODO(wuzewu): version sort method
resource_version_list = sorted(resource_version_list)
if not version:
if not resource_version_list:
......
......@@ -83,7 +83,6 @@ def from_param_to_module_attr(param, module_attr):
module_attr.map.data['trainable'])
from_pyobj_to_module_attr(param.do_model_average,
module_attr.map.data['do_model_average'])
#TODO(wuzewu): don't save learning rate
from_pyobj_to_module_attr(param.optimize_attr,
module_attr.map.data['optimize_attr'])
from_pyobj_to_module_attr(
......
......@@ -117,7 +117,6 @@ def get_pykey(key, keyed_type):
return str(key)
#TODO(wuzewu): solving the problem of circular references
def from_pyobj_to_module_attr(pyobj, module_attr, obj_filter=None):
if obj_filter and obj_filter(pyobj):
return
......
......@@ -20,6 +20,7 @@ from .msra_ner import MSRA_NER
from .nlpcc_dbqa import NLPCC_DBQA
from .lcqmc import LCQMC
from .toxic import Toxic
from .squad import SQUAD
from .xnli import XNLI
from .glue import GLUE
......
......@@ -52,7 +52,6 @@ class ImageClassificationDataset(object):
return dataset_path
def _parse_data(self, data_path, shuffle=False, phase=None):
def _base_reader():
data = []
with open(data_path, "r") as file:
while True:
......@@ -67,8 +66,7 @@ class ImageClassificationDataset(object):
image_path = items[0]
if not os.path.isabs(image_path):
if self.base_path is not None:
image_path = os.path.join(self.base_path,
image_path)
image_path = os.path.join(self.base_path, image_path)
label = items[-1]
data.append((image_path, items[-1]))
......@@ -82,6 +80,7 @@ class ImageClassificationDataset(object):
if shuffle:
np.random.shuffle(data)
def _base_reader():
for item in data:
yield item
......
......@@ -39,11 +39,18 @@ class GLUE(HubDataset):
def __init__(self, sub_dataset='SST-2'):
# sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B
if sub_dataset not in [
'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B'
'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP',
'RTE', 'SST-2', 'STS-B'
]:
raise Exception(
sub_dataset +
" is not in GLUE benchmark. Please confirm the data set")
self.mismatch = False
if sub_dataset == 'MNLI_mm':
sub_dataset = 'MNLI'
self.mismatch = True
elif sub_dataset == 'MNLI_m':
sub_dataset = 'MNLI'
self.sub_dataset = sub_dataset
self.dataset_dir = os.path.join(DATA_HOME, "glue_data")
......@@ -64,9 +71,12 @@ class GLUE(HubDataset):
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
if self.sub_dataset == 'MNLI':
if self.sub_dataset == 'MNLI' and not self.mismatch:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev_matched.tsv")
elif self.sub_dataset == 'MNLI' and self.mismatch:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev_mismatched.tsv")
else:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev.tsv")
......@@ -76,9 +86,12 @@ class GLUE(HubDataset):
self.test_examples = []
def _load_predict_examples(self):
if self.sub_dataset == 'MNLI':
if self.sub_dataset == 'MNLI' and not self.mismatch:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test_matched.tsv")
elif self.sub_dataset == 'MNLI' and self.mismatch:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test_mismatched.tsv")
else:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test.tsv")
......@@ -187,7 +200,7 @@ class GLUE(HubDataset):
seq_id += 1
examples.append(example)
except:
print("[Discard Incorrect Data] " + "\t".join(line))
logger.info("[Discard Incorrect Data] " + "\t".join(line))
return examples
......
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
class SquadExample(object):
    """A single SQuAD question together with its tokenized paragraph.

    For examples without an answer, the start and end position are -1.
    ``start_position``/``end_position`` are token indices into
    ``doc_tokens`` and stay ``None`` when no answer span was supplied.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """Return a one-line, comma-separated description of the example."""
        parts = [
            "qas_id: %s" % (tokenization.printable_text(self.qas_id)),
            "question_text: %s" % (tokenization.printable_text(
                self.question_text)),
            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
        ]
        # Compare against None, not truthiness: token index 0 is a valid
        # answer position and must still be shown.
        if self.start_position is not None:
            parts.append("start_position: %d" % (self.start_position))
        # Guard on end_position itself so %d is never applied to None.
        if self.end_position is not None:
            parts.append("end_position: %d" % (self.end_position))
        # is_impossible is only shown when a start position is present.
        if self.start_position is not None:
            parts.append("is_impossible: %r" % (self.is_impossible))
        return ", ".join(parts)
class SQUAD(object):
    """SQuAD 1.1 / 2.0 reading-comprehension dataset.

    On construction the archive is downloaded into ``DATA_HOME`` unless the
    ``squad_data`` directory already exists. The train file is then parsed
    into ``self.train_examples`` and the dev file into
    ``self.predict_examples``.
    """

    def __init__(self, version_2_with_negative=False):
        self.dataset_dir = os.path.join(DATA_HOME, "squad_data")
        if not os.path.exists(self.dataset_dir):
            _, _, self.dataset_dir = default_downloader.download_file_and_uncompress(
                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
        else:
            logger.info("Dataset {} already cached.".format(self.dataset_dir))

        self._load_train_examples(version_2_with_negative, is_training=True)
        self._load_predict_examples(version_2_with_negative, is_training=False)

    def _load_train_examples(self,
                             version_2_with_negative=False,
                             is_training=True):
        """Parse the v1.1 or v2.0 training file into ``self.train_examples``."""
        if not version_2_with_negative:
            self.train_file = os.path.join(self.dataset_dir, "train-v1.1.json")
        else:
            self.train_file = os.path.join(self.dataset_dir, "train-v2.0.json")
        self.train_examples = self._read_json(self.train_file, is_training,
                                              version_2_with_negative)

    def _load_predict_examples(self,
                               version_2_with_negative=False,
                               is_training=False):
        """Parse the v1.1 or v2.0 dev file into ``self.predict_examples``."""
        if not version_2_with_negative:
            self.predict_file = os.path.join(self.dataset_dir, "dev-v1.1.json")
        else:
            self.predict_file = os.path.join(self.dataset_dir, "dev-v2.0.json")
        self.predict_examples = self._read_json(self.predict_file, is_training,
                                                version_2_with_negative)

    def get_train_examples(self):
        return self.train_examples

    def get_dev_examples(self):
        # No dev split is exposed here; the parsed dev file is only
        # available as self.predict_examples.
        return []

    def get_test_examples(self):
        return []

    def _read_json(self, input_file, is_training,
                   version_2_with_negative=False):
        """Read a SQuAD json file into a list of SquadExample.

        Args:
            input_file: path of the SQuAD JSON file.
            is_training: when True, answer spans are read and converted to
                token positions; otherwise positions stay ``None``.
            version_2_with_negative: when True (and training), the
                ``is_impossible`` flag of each question is honoured.

        Returns:
            A list of ``SquadExample``. Training examples whose answer text
            cannot be recovered from the paragraph tokens are skipped.

        Raises:
            ValueError: a training question that is not impossible does not
                have exactly one answer.
        """
        import io

        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default encoding.
        with io.open(input_file, "r", encoding="utf8") as reader:
            input_data = json.load(reader)["data"]

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                # Split on whitespace while recording, for every character,
                # the index of the token it belongs to.
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    is_impossible = False
                    if is_training:
                        if version_2_with_negative:
                            is_impossible = qa["is_impossible"]
                        if (len(qa["answers"]) != 1) and (not is_impossible):
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        if not is_impossible:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[
                                answer_offset + answer_length - 1]
                            # Only add answers where the text can be exactly
                            # recovered from the document. If this CAN'T
                            # happen it's likely due to weird Unicode stuff
                            # so we will just skip the example.
                            #
                            # Note that this means for training mode, every
                            # example is NOT guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    orig_answer_text))
                            if cleaned_answer_text not in actual_text:
                                # Format eagerly: every other logger call in
                                # this code passes a single ready-made
                                # message, so do not rely on lazy %-args.
                                logger.warning(
                                    "Could not find answer: '%s' vs. '%s'" %
                                    (actual_text, cleaned_answer_text))
                                continue
                        else:
                            start_position = -1
                            end_position = -1
                            orig_answer_text = ""

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible)
                    examples.append(example)

        return examples
if __name__ == "__main__":
    # Manual smoke test: load SQuAD 2.0 and print at most ten dev examples.
    dataset = SQUAD(version_2_with_negative=True)
    for example in dataset.get_dev_examples()[:10]:
        print(example)
......@@ -43,6 +43,7 @@ class XNLI(HubDataset):
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh"
]:
raise Exception(language +
"is not in XNLI. Please confirm the language")
self.language = language
......
......@@ -22,4 +22,5 @@ message CheckPoint {
int64 current_epoch = 1;
int64 global_step = 2;
string latest_model_dir = 3;
double best_score = 4;
}
......@@ -37,6 +37,7 @@ def load_checkpoint(checkpoint_dir, exe, main_program):
ckpt.ParseFromString(f.read())
current_epoch = 1
global_step = 0
best_score = -999
def if_exist(var):
return os.path.exists(os.path.join(ckpt.latest_model_dir, var.name))
......@@ -45,20 +46,27 @@ def load_checkpoint(checkpoint_dir, exe, main_program):
fluid.io.load_vars(
exe, ckpt.latest_model_dir, main_program, predicate=if_exist)
# Compatible with older versions without best_score in checkpoint_pb2
try:
best_score = ckpt.best_score
except:
best_score = -999
logger.info("PaddleHub model checkpoint loaded. current_epoch={}, "
"global_step={}".format(ckpt.current_epoch,
ckpt.global_step))
return True, ckpt.current_epoch, ckpt.global_step
"global_step={}, best_score={:.5f}".format(
ckpt.current_epoch, ckpt.global_step, best_score))
return True, ckpt.current_epoch, ckpt.global_step, best_score
logger.info(
"PaddleHub model checkpoint not found, start training from scratch...")
logger.info("PaddleHub model checkpoint not found, start from scratch...")
return False, current_epoch, global_step
return False, current_epoch, global_step, best_score
def save_checkpoint(checkpoint_dir,
current_epoch,
global_step,
best_score,
exe,
main_program=fluid.default_main_program()):
......@@ -73,5 +81,6 @@ def save_checkpoint(checkpoint_dir,
ckpt.current_epoch = current_epoch
ckpt.global_step = global_step
ckpt.latest_model_dir = model_saved_dir
ckpt.best_score = best_score
with open(ckpt_meta_path, "wb") as f:
f.write(ckpt.SerializeToString())
#coding:utf-8
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: checkpoint.proto
......@@ -18,7 +17,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
package='paddlehub.task.checkpoint',
syntax='proto3',
serialized_pb=_b(
'\n\x10\x63heckpoint.proto\x12\x19paddlehub.task.checkpoint\"R\n\nCheckPoint\x12\x15\n\rcurrent_epoch\x18\x01 \x01(\x03\x12\x13\n\x0bglobal_step\x18\x02 \x01(\x03\x12\x18\n\x10latest_model_dir\x18\x03 \x01(\tB\x02H\x03\x62\x06proto3'
'\n\x10\x63heckpoint.proto\x12\x19paddlehub.task.checkpoint\"f\n\nCheckPoint\x12\x15\n\rcurrent_epoch\x18\x01 \x01(\x03\x12\x13\n\x0bglobal_step\x18\x02 \x01(\x03\x12\x18\n\x10latest_model_dir\x18\x03 \x01(\t\x12\x12\n\nbest_score\x18\x04 \x01(\x01\x42\x02H\x03\x62\x06proto3'
))
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
......@@ -77,6 +76,22 @@ _CHECKPOINT = _descriptor.Descriptor(
is_extension=False,
extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='best_score',
full_name='paddlehub.task.checkpoint.CheckPoint.best_score',
index=3,
number=4,
type=1,
cpp_type=5,
label=1,
has_default_value=False,
default_value=float(0),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
options=None),
],
extensions=[],
nested_types=[],
......@@ -87,7 +102,7 @@ _CHECKPOINT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[],
serialized_start=47,
serialized_end=129,
serialized_end=149,
)
DESCRIPTOR.message_types_by_name['CheckPoint'] = _CHECKPOINT
......
......@@ -128,3 +128,75 @@ def calculate_f1(num_label, num_infer, num_correct):
else:
f1 = 2 * precision * recall / (precision + recall)
return precision, recall, f1
def calculate_f1_np(preds, labels):
    """Return the binary F1 score of ``preds`` measured against ``labels``.

    Both arguments are array-likes of 0/1 values of equal length; label 1 is
    the positive class. Precision, recall and F1 each fall back to 0 when
    their denominator is zero.
    """
    preds = np.array(preds)
    labels = np.array(labels)
    tp = np.sum((labels == 1) & (preds == 1))
    fp = np.sum((labels == 0) & (preds == 1))
    fn = np.sum((labels == 1) & (preds == 0))
    # Divide by a float so the ratio is never truncated by integer division,
    # whatever division semantics the importing module uses.
    p = tp / float(tp + fp) if (tp + fp) else 0
    r = tp / float(tp + fn) if (tp + fn) else 0
    f1 = (2 * p * r) / (p + r) if p + r else 0
    return f1
def matthews_corrcoef(preds, labels):
    """Return the Matthews correlation coefficient for binary predictions.

    Both arguments are array-likes of 0/1 values of equal length. The result
    is 0 when any marginal count is zero (the denominator would be zero).
    """
    preds = np.array(preds)
    labels = np.array(labels)
    # Convert the counts to floats: the denominator multiplies four counts,
    # which silently overflows fixed-width integers on large inputs.
    tp = float(np.sum((labels == 1) & (preds == 1)))
    tn = float(np.sum((labels == 0) & (preds == 0)))
    fp = float(np.sum((labels == 0) & (preds == 1)))
    fn = float(np.sum((labels == 1) & (preds == 0)))

    div = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    mcc = ((tp * tn) - (fp * fn)) / np.sqrt(div) if div else 0
    return mcc
def recall_nk(data, n, k, m):
    """Return the fraction of groups whose positive sample ranks in the top k.

    Args:
        data: list of ``(positive_probability, label)`` tuples, laid out in
            consecutive groups of ``m`` items; the first item of every group
            must be the positive sample (label 1).
        n: number of items, counted from the start of a group, that compete
            in the ranking.
        k: the group counts as correct when the k-th highest score among
            those ``n`` items does not exceed the positive sample's score.
        m: group size, i.e. the distance between consecutive positives.

    Raises:
        AssertionError: the first item of a group is not labelled 1.
        ZeroDivisionError: ``data`` holds fewer than ``m`` items.
    """
    group_count = len(data) // m
    hits = 0.0
    for group in range(group_count):
        start = group * m
        assert data[start][1] == 1
        positive_score = data[start][0]
        # Rank the first n candidates of the group by descending score.
        ranked = sorted(
            data[start:start + n], key=lambda item: item[0], reverse=True)
        if ranked[k - 1][0] <= positive_score:
            hits += 1
    return hits / group_count
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
from paddle.fluid.layers import control_flow
from paddlehub.common.logger import logger
def adam_weight_decay_optimization(loss,
                                   warmup_steps,
                                   num_train_steps,
                                   learning_rate,
                                   main_program,
                                   weight_decay,
                                   scheduler='linear_decay'):
    """Add Adam optimization with a scheduled learning rate and weight decay.

    Args:
        loss: loss variable passed to ``optimizer.minimize``.
        warmup_steps: number of warm-up steps for the chosen scheduler.
        num_train_steps: decay length used by the ``linear_decay`` scheduler.
        learning_rate: base learning rate.
        main_program: program whose parameters are snapshotted for decay.
        weight_decay: decay coefficient; decay ops are added only when > 0.
        scheduler: ``'noam_decay'`` or ``'linear_decay'``.

    Returns:
        The scheduled learning-rate variable.

    Raises:
        ValueError: ``scheduler`` is neither of the two supported names.
    """
    if scheduler == 'noam_decay':
        if warmup_steps > 0:
            # presumably this first argument is chosen so that the noam
            # schedule peaks at `learning_rate` -- TODO confirm against the
            # noam_decay formula
            scheduled_lr = fluid.layers.learning_rate_scheduler\
            .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
            warmup_steps)
        else:
            logger.warning(
                "Noam decay learning rate scheduler should have positive \
warmup steps, using constant learning rate instead!")
            # Constant learning rate fallback.
            scheduled_lr = fluid.layers.create_global_var(
                shape=[1],
                value=learning_rate,
                dtype='float32',
                persistable=True,
                name="learning_rate")
    elif scheduler == 'linear_decay':
        scheduled_lr = linear_warmup_decay(learning_rate, num_train_steps,
                                           warmup_steps, main_program)
    else:
        raise ValueError("Unkown learning rate scheduler, should be "
                         "'noam_decay' or 'linear_decay'")

    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    # Clip gradients by global norm with a fixed threshold of 1.0.
    clip_norm_thres = 1.0
    fluid.clip.set_gradient_clip(
        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))

    def exclude_from_weight_decay(name):
        # Layer-norm parameters and bias terms are not decayed.
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    # Snapshot every parameter (param * 1.0 yields a separate variable)
    # before minimize() adds the update ops; gradients are stopped on it.
    param_list = dict()

    for param in main_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param.name):
                continue
            with param.block.program._optimized_guard(
                [param, grad]), fluid.framework.name_scope("weight_decay"):
                # Subtract the decay term, computed from the snapshot, from
                # the parameter and write the result back.
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                fluid.layers.assign(output=param, input=updated_param)

    return scheduled_lr
def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps,
                        main_program):
    """Build a learning rate with a warm-up branch and a decay branch.

    While the step counter is below ``num_warmup_steps`` the rate is
    ``init_lr * step / num_warmup_steps``. Otherwise it comes from
    ``polynomial_decay`` with ``power=1.0``, ``end_learning_rate=0.0`` and
    ``decay_steps=num_train_steps``.

    Returns:
        The global variable (named ``"learning_rate"``) that receives the
        scheduled value.
    """
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()

        lr = fluid.layers.create_global_var(
            shape=[1],
            value=init_lr,
            dtype='float32',
            persistable=True,
            name="learning_rate")

        with control_flow.Switch() as switch:
            # Warm-up branch.
            with switch.case(global_step < num_warmup_steps):
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            # Decay branch.
            with switch.default():
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr
......@@ -18,12 +18,15 @@ from __future__ import division
from __future__ import print_function
import os
import math
import multiprocessing
import paddle.fluid as fluid
from paddlehub.finetune.optimization import adam_weight_decay_optimization
from paddlehub.common.logger import logger
from paddlehub.finetune.regularizer import L2SPDecayRegularizer
import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
from paddle.fluid.layers import control_flow
def get_pretrained_parameter(main_program, start_program):
......@@ -40,6 +43,99 @@ def get_pretrained_parameter(main_program, start_program):
return pretrained_parameters
def get_parentOp_depth_max(parent_ops, op_depth_dict):
    """Return the largest recorded depth among ``parent_ops``, at least 1.

    Parents missing from ``op_depth_dict`` count as depth 1.
    """
    depths = [op_depth_dict.get(parent_op, 1) for parent_op in parent_ops]
    return max([1] + depths)
def get_opDepth_min(ops, op_depth_dict):
    """Return the smallest depth among ``ops``.

    The result is capped by the largest depth in ``op_depth_dict``, which is
    also what an empty ``ops`` yields. Every op must be a key of the dict.
    """
    upper_bound = max(op_depth_dict.values())
    return min([upper_bound] + [op_depth_dict[op] for op in ops])
def get_depth_parameter(main_program):
    """Group the parameters of ``main_program`` by the depth of their ops.

    Returns:
        dict mapping a depth to the list of parameters assigned to it, after
        parameters of the next depth that share a name prefix have been
        merged into the shallower group.
    """
    global_block = main_program.global_block()

    # For every variable name: the ops that read it ("output_ops") and the
    # ops that write it ("input_ops").
    var_op_dict = {}
    for op in global_block.ops:

        for input_arg in op.input_arg_names:
            if input_arg not in var_op_dict.keys():
                var_op_dict[input_arg] = {"output_ops": [], "input_ops": []}
            var_op_dict[input_arg]["output_ops"].append(op)

        for output_arg in op.output_arg_names:
            if output_arg not in var_op_dict.keys():
                var_op_dict[output_arg] = {"output_ops": [], "input_ops": []}
            var_op_dict[output_arg]["input_ops"].append(op)

    # Depth of an op is 1 when none of its inputs is produced by another op,
    # otherwise one more than the deepest producer of its inputs.
    op_depth_dict = {}
    for op in global_block.ops:
        parent_ops = []
        for input_arg in op.input_arg_names:
            for parent_op in var_op_dict[input_arg]["input_ops"]:
                if parent_op not in parent_ops:
                    parent_ops.append(parent_op)
        if not parent_ops:
            op_depth_dict[op] = 1
        else:
            op_depth_dict[op] = get_parentOp_depth_max(parent_ops,
                                                       op_depth_dict) + 1

    # A parameter takes the minimum depth of the ops that consume it.
    depth_params_dict = {}
    updated_depth_params_dict = {}
    for param in global_block.iter_parameters():
        adherent_ops = var_op_dict[param.name]["output_ops"]
        depth = get_opDepth_min(adherent_ops, op_depth_dict)
        if depth not in depth_params_dict.keys():
            depth_params_dict[depth] = []
            updated_depth_params_dict[depth] = []
        depth_params_dict[depth].append(param)
        updated_depth_params_dict[depth].append(param)

    # Move parameters of the next depth into the current depth when their
    # name prefix (text before the first ".") matches a current parameter.
    depth_list = sorted(depth_params_dict.keys())
    len_depth_list = len(depth_list)
    for index, depth in enumerate(depth_list):
        for param in depth_params_dict[depth]:
            prefix = param.name.split(".")[0]
            if index < len_depth_list - 1:
                next_depth = depth_list[index + 1]
                for param_next_depth in depth_params_dict[next_depth]:
                    prefix_next_depth = param_next_depth.name.split(".")[0]
                    if prefix == prefix_next_depth:
                        # NOTE(review): if two parameters at `depth` share a
                        # prefix, the same next-depth parameter appears to be
                        # matched twice and the second remove() would raise
                        # ValueError -- confirm.
                        updated_depth_params_dict[depth].append(
                            param_next_depth)
                        updated_depth_params_dict[next_depth].remove(
                            param_next_depth)

                        # NOTE(review): once a depth is popped here, later
                        # lookups of that key in updated_depth_params_dict
                        # would raise KeyError -- confirm this cannot occur.
                        if not updated_depth_params_dict[next_depth]:
                            updated_depth_params_dict.pop(next_depth)

    return updated_depth_params_dict
def set_gradual_unfreeze(main_program, unfreeze_depths):
    """Set ``stop_gradient`` on every parameter according to its depth group.

    Parameters in the groups listed in ``unfreeze_depths`` get
    ``stop_gradient = False``; parameters of all remaining groups get
    ``stop_gradient = True``.
    """
    params_by_depth = get_depth_parameter(main_program)

    for depth in unfreeze_depths:
        for param in params_by_depth[depth]:
            param.stop_gradient = False

    frozen_depths = set(params_by_depth.keys()).difference(
        set(unfreeze_depths))
    for depth in frozen_depths:
        for param in params_by_depth[depth]:
            param.stop_gradient = True
class DefaultStrategy(object):
def __init__(self, learning_rate=1e-4, optimizer_name="adam"):
self.learning_rate = learning_rate
......@@ -75,133 +171,403 @@ class DefaultStrategy(object):
self.optimizer = fluid.optimizer.Adam(
learning_rate=self.learning_rate)
def execute(self, loss, data_reader, config):
def execute(self, loss, data_reader, config, dev_count):
if self.optimizer is not None:
self.optimizer.minimize(loss)
else:
raise ValueError("DefaultStrategy's optimizer is None")
# TODO complete __str__()
def __str__(self):
return "DefaultStrategy"
def step(self):
pass
class AdamWeightDecayStrategy(DefaultStrategy):
class CombinedStrategy(DefaultStrategy):
def __init__(self,
optimizer_name="adam",
learning_rate=1e-4,
lr_scheduler="linear_decay",
warmup_proportion=0.1,
weight_decay=0.01,
optimizer_name="adam"):
super(AdamWeightDecayStrategy, self).__init__(
learning_rate=learning_rate, optimizer_name=optimizer_name)
# check strategy correctness
if lr_scheduler not in ["linear_decay", "noam_decay"]:
raise ValueError("lr_scheduler {} is not setup "
"correctly".format(lr_scheduler))
self._lr_scheduler = lr_scheduler
self._warmup_proportion = warmup_proportion
self._weight_decay = weight_decay
@property
def lr_scheduler(self):
return self._lr_scheduler
@property
def warmup_proportion(self):
return self._warmup_proportion
@property
def weight_decay(self):
return self._weight_decay
def execute(self, loss, data_reader, config):
main_program = loss.block.program
# calculate wamrup step
dev_count = self._get_dev_count(config)
scheduler=None,
regularization=None,
clip=None):
super(CombinedStrategy, self).__init__(
optimizer_name=optimizer_name, learning_rate=learning_rate)
# init set
self.scheduler = {
"warmup": 0.0,
"linear_decay": {
"start_point": 1.0,
"end_learning_rate": 0.0,
},
"noam_decay": False,
"discriminative": {
"blocks": 0,
"factor": 2.6
},
"gradual_unfreeze": 0,
"slanted_triangle": {
"cut_fraction": 0.0,
"ratio": 32
}
}
self.regularization = {
"L2": 0.0,
"L2SP": 0.0,
"weight_decay": 0.0,
}
self.clip = {"GlobalNorm": 0.0, "Norm": 0.0}
if scheduler == None:
scheduler = {}
if regularization == None:
regularization = {}
if clip == None:
clip = {}
# check legality and assign
for name in scheduler:
self.check_assign(self.scheduler, name, scheduler[name])
for name in regularization:
self.check_assign(self.regularization, name, regularization[name])
for name in clip:
self.check_assign(self.clip, name, clip[name])
self.epoch = 0
self.main_program = None
def check_assign(self, dictionary, key, value):
if key not in dictionary:
raise ValueError("Invalid parameter: %s" % key)
if isinstance(value, dict) and isinstance(dictionary[key], dict):
sub_dict = dictionary[key]
for sub_name in value:
self.check_assign(sub_dict, sub_name, value[sub_name])
elif isinstance(dictionary[key],
type(value)) or (isinstance(dictionary[key], float)
and isinstance(value, (float, int))):
dictionary[key] = value
else:
if isinstance(dictionary[key], dict):
raise ValueError(
"The type of parameter %s should be a dict with keys: %s" %
(key, dictionary[key].keys()))
else:
raise ValueError("The type of parameter %s should be %s" %
(key, type(dictionary[key])))
def add_scheduler(self, name="warmup", value=0, **values):
    """Set one scheduler option, validated by ``check_assign``.

    Keyword arguments in ``values``, when given, are passed as a dict of
    sub-options and take precedence over ``value``.
    """
    new_setting = values if values else value
    self.check_assign(self.scheduler, name, new_setting)
def add_regularization(self, name="L2", value=1e-3, **values):
    """Set one regularization option, validated by ``check_assign``.

    Keyword arguments in ``values``, when given, are passed as a dict and
    take precedence over ``value``.
    """
    new_setting = values if values else value
    self.check_assign(self.regularization, name, new_setting)
def add_clip(self, name="GlobalNorm", value=1.0, **values):
    """Set one gradient-clipping option, validated by ``check_assign``.

    Keyword arguments in ``values``, when given, are passed as a dict and
    take precedence over ``value``.
    """
    new_setting = values if values else value
    self.check_assign(self.clip, name, new_setting)
def scheduler_handler(self, max_train_steps):
# Build the learning-rate schedule for a run of `max_train_steps` steps,
# re-create the optimizer with it, and apply discriminative per-depth
# learning-rate scaling when configured. Returns the learning-rate variable.
#
# Reads self.scheduler, self.learning_rate, self.main_program and
# self._optimizer_name; when discriminative blocks are enabled it also reads
# self.sorted_depth and self.depth_params_dict.
# NOTE(review): leading indentation is missing in this view of the file, so
# the comments below describe statements, not their nesting.
#
# Persistable global variable named "learning_rate", initialised to the base
# learning rate.
scheduled_lr = fluid.layers.create_global_var(
shape=[1],
value=self.learning_rate,
dtype='float32',
persistable=True,
name="learning_rate")
# A falsy slanted_triangle cut_fraction selects the warmup / linear-decay /
# noam path.
if not self.scheduler["slanted_triangle"]["cut_fraction"]:
# Both fractions are converted to absolute step counts.
warmup_steps = int(max_train_steps * self.scheduler["warmup"])
linear_decay_start = int(
max_train_steps * self.scheduler["linear_decay"]["start_point"])
# Linear decay is never allowed to begin before warmup has finished.
if linear_decay_start < warmup_steps:
logger.warning(
"linear decay can not start during warmup process,"
"it will start after warmup ends!")
linear_decay_start = warmup_steps
# Noam decay rebinds scheduled_lr to the scheduler's own output.
if self.scheduler["noam_decay"]:
if warmup_steps > 0:
# NOTE(review): the first argument looks chosen so that the Noam peak equals
# self.learning_rate -- confirm against noam_decay's formula.
scheduled_lr = fluid.layers.learning_rate_scheduler \
.noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
warmup_steps)
else:
logger.warning(
"Noam decay learning rate scheduler should have positive \
warmup steps, using constant learning rate instead!")
# Without noam decay, warmup and/or linear decay are expressed as Switch
# cases that assign into scheduled_lr.
if not self.scheduler["noam_decay"] and \
(warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"]<1):
with self.main_program._lr_schedule_guard():
global_step = lr_scheduler._decay_step_counter()
with control_flow.Switch() as switch:
if warmup_steps > 0:
with switch.case(global_step < warmup_steps):
# Warmup: scale the base rate by global_step / warmup_steps.
decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
fluid.layers.assign(decayed_lr, scheduled_lr)
if self.scheduler["linear_decay"]["start_point"] < 1:
with switch.case(global_step >= linear_decay_start):
# Linear decay: polynomial_decay with power=1.0 towards end_learning_rate.
decayed_lr = lr_scheduler.polynomial_decay(
learning_rate=self.learning_rate,
decay_steps=max_train_steps,
end_learning_rate=self.scheduler[
"linear_decay"]["end_learning_rate"],
power=1.0,
cycle=False)
fluid.layers.assign(decayed_lr, scheduled_lr)
# Slanted-triangle path: a warning is logged if any other scheduler option is
# also set.
else:
if self.scheduler["warmup"] or self.scheduler[
"noam_decay"] or self.scheduler["linear_decay"][
"start_point"] < 1:
logger.warning(
"You are using slanted_triangle learning rate "
"which will make warmup, noam_decay and linear_decay unable"
)
# cut_step is the step that separates the two Switch branches below.
# NOTE(review): cut_step is 0 when max_train_steps * cut_fraction < 1, and it
# is used as a divisor below -- confirm this cannot happen.
cut_step = int(max_train_steps *
self.scheduler["slanted_triangle"]["cut_fraction"])
ratio = self.scheduler["slanted_triangle"]["ratio"]
# NOTE(review): unlike the path above, the step counter here is created
# without main_program._lr_schedule_guard() -- confirm this is intended.
global_step = lr_scheduler._decay_step_counter()
with control_flow.Switch() as switch:
with switch.case(global_step <= cut_step):
# Up to cut_step: pct is global_step as a fraction of cut_step.
pct = global_step / cut_step
decayed_lr = self.learning_rate * (1 + pct *
(ratio - 1)) / ratio
fluid.layers.assign(decayed_lr, scheduled_lr)
with switch.default():
# After cut_step: pct shrinks as global_step approaches max_train_steps.
pct = 1 - (global_step - cut_step) / (
max_train_steps - cut_step)
decayed_lr = self.learning_rate * (1 + pct *
(ratio - 1)) / ratio
fluid.layers.assign(decayed_lr, scheduled_lr)
# Re-run the base-class initialiser with the scheduled rate as learning_rate.
# NOTE(review): presumably this rebuilds self.optimizer and rebinds
# self.learning_rate to the variable -- confirm in DefaultStrategy.__init__.
super(CombinedStrategy, self).__init__(
optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)
# Discriminative fine-tuning: each parameter's optimize_attr learning rate is
# multiplied by (1 / factor) ** power.
if self.scheduler["discriminative"]["blocks"]:
# _block_layers = ceil(number of depths / number of blocks).
_block_layers = math.ceil(
len(self.sorted_depth) /
self.scheduler["discriminative"]["blocks"])
power = 0
for cnt, depth in enumerate(self.sorted_depth):
for index, param in enumerate(self.depth_params_dict[depth]):
param.optimize_attr["learning_rate"] *= \
pow(1.0 / self.scheduler["discriminative"]["factor"], power)
# power grows by one whenever cnt is a non-zero multiple of _block_layers.
if cnt and cnt % _block_layers == 0:
power += 1
return scheduled_lr
def clip_handler(self):
    """Install gradient clipping according to ``self.clip``.

    A truthy ``"GlobalNorm"`` value takes precedence; ``"Norm"`` is only
    consulted when ``"GlobalNorm"`` is falsy. Nothing is installed when both
    are falsy.
    """
    if self.clip["GlobalNorm"]:
        global_norm_clip = fluid.clip.GradientClipByGlobalNorm(
            clip_norm=self.clip["GlobalNorm"])
        fluid.clip.set_gradient_clip(clip=global_norm_clip)
        return

    if self.clip["Norm"]:
        norm_clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip["Norm"])
        fluid.clip.set_gradient_clip(clip=norm_clip)
def regularization_handler(self, loss, scheduled_lr):
# Attach the configured regularizers, run the optimizer's minimize() on
# `loss`, and then append decoupled weight-decay updates if enabled.
#
# Reads self.regularization, self.main_program and self.optimizer.
# `scheduled_lr` scales the weight-decay step. Returns nothing.
# NOTE(review): leading indentation is missing in this view of the file, so
# the comments below describe statements, not their nesting.
#
# L2: every parameter of the global block gets an L2Decay regularizer.
if self.regularization["L2"]:
for param in self.main_program.global_block().all_parameters():
param.regularizer = fluid.regularizer.L2Decay(
regularization_coeff=self.regularization["L2"])
# Looked up unconditionally; the result is only used by the L2SP branch below.
pretrained_params = get_pretrained_parameter(
self.main_program, fluid.default_startup_program())
# L2SP: only the parameters returned above get an L2SPDecayRegularizer.
if self.regularization["L2SP"]:
#TODO: L2SP can only run in one process now
for index, param in enumerate(pretrained_params):
param.regularizer = L2SPDecayRegularizer(
regularization_coeff=self.regularization["L2SP"])
# Regularizers are set before minimize(); its second return value is the
# list of (param, grad) pairs used for weight decay below.
_, param_grads = self.optimizer.minimize(loss)
if self.regularization["weight_decay"]:
# Snapshot of every parameter (param * 1.0) with gradients stopped.
# NOTE(review): the snapshot ops are created after minimize(), so presumably
# they see already-updated values -- confirm this ordering is intended.
param_list = {}
for param in self.main_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
for param, grad in param_grads:
# Names matched by exclude_from_weight_decay are skipped.
if self.exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
# param <- param - snapshot * weight_decay * scheduled_lr
updated_param = param - param_list[
param.name] * self.regularization[
"weight_decay"] * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
def execute(self, loss, data_reader, config, dev_count):
# base information
self.main_program = loss.block.program
self.config = config
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
data_reader.data_generator(
batch_size=config.batch_size, phase='val', shuffle=False)
data_reader.data_generator(
batch_size=config.batch_size, phase='dev', shuffle=False)
num_train_examples = data_reader.get_num_examples(phase='train')
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
warmup_steps = int(max_train_steps * self.warmup_proportion)
scheduled_lr = adam_weight_decay_optimization(
loss, warmup_steps, max_train_steps, self.learning_rate,
main_program, self.weight_decay, self.lr_scheduler)
data_reader.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False)
num_train_examples = len(data_reader.get_train_examples())
return scheduled_lr
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
def _get_dev_count(self, config):
if config.use_cuda:
dev_count = fluid.core.get_cuda_device_count()
try:
# nlp_reader
_in_tokens = data_reader.in_tokens
if _in_tokens:
max_train_steps *= data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
"gradual_unfreeze"] > 0:
self.depth_params_dict = get_depth_parameter(self.main_program)
self.sorted_depth = sorted(
self.depth_params_dict.keys(), reverse=True)
self.max_depth = len(self.sorted_depth)
logger.info(self.__str__())
# handle scheduler
scheduled_lr = self.scheduler_handler(max_train_steps)
# handle clip
self.clip_handler()
# handle regularization
self.regularization_handler(loss, scheduled_lr)
return scheduled_lr, max_train_steps
def exclude_from_weight_decay(self, name):
    """Return True if the parameter called ``name`` is exempt from weight decay.

    A name is exempt when it contains ``"layer_norm"`` or ends with one of the
    bias suffixes ``"_bias"``, ``"_b"`` or ``".b_0"``.
    """
    if "layer_norm" in name:
        return True
    return name.endswith(("_bias", "_b", ".b_0"))
def step(self):
if self.scheduler["gradual_unfreeze"] > 0:
self.epoch += 1
if self.max_depth > 0 and self.epoch <= self.scheduler[
"gradual_unfreeze"]:
set_gradual_unfreeze(
self.main_program,
unfreeze_depths=self.
sorted_depth[:self.max_depth * self.epoch //
self.scheduler["gradual_unfreeze"]])
else:
dev_count = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
return dev_count
logger.warning(
"The max op-depth in the network is %s. That results in that can't use the gradual unfreeze finetune strategy."
% (self.max_depth))
else:
pass
# TODO complete __str__()
def __str__(self):
return "AdamWeightDecayStrategy"
return "Strategy with sheduler: %s, regularization: %s and clip: %s" % (
self.scheduler, self.regularization, self.clip)
class DefaultFinetuneStrategy(DefaultStrategy):
class AdamWeightDecayStrategy(CombinedStrategy):
def __init__(self,
learning_rate=1e-4,
optimizer_name="adam",
regularization_coeff=1e-3):
super(DefaultFinetuneStrategy, self).__init__(
learning_rate=learning_rate, optimizer_name=optimizer_name)
self.learning_rate = learning_rate
self._optimizer_name = optimizer_name
self.regularization_coeff = regularization_coeff
def execute(self, loss, data_reader, config):
# get pretrained parameters
program = loss.block.program
global_block = program.global_block()
pretrained_params = get_pretrained_parameter(
program, fluid.default_startup_program())
# set parameter attrs
for index, param in enumerate(pretrained_params):
param.regularizer = fluid.regularizer.L2Decay(
regularization_coeff=self.regularization_coeff)
if self.optimizer is not None:
self.optimizer.minimize(loss)
lr_scheduler="linear_decay",
warmup_proportion=0.1,
weight_decay=0.01,
optimizer_name="adam"):
scheduler = {"warmup": warmup_proportion}
if lr_scheduler == "noam_decay":
scheduler["noam_decay"] = True
elif lr_scheduler == "linear_decay":
scheduler["linear_decay"] = {
"start_point": warmup_proportion,
"end_learning_rate": 0,
}
else:
raise ValueError("DefaultFinetuneStrategy's optimizer is None")
raise ValueError("lr_scheduler {} is not setup "
"correctly".format(lr_scheduler))
regularization = {"weight_decay": weight_decay}
clip = {"GlobalNorm": 1.0}
super(AdamWeightDecayStrategy, self).__init__(
optimizer_name=optimizer_name,
learning_rate=learning_rate,
scheduler=scheduler,
regularization=regularization,
clip=clip)
class L2SPFinetuneStrategy(DefaultStrategy):
class L2SPFinetuneStrategy(CombinedStrategy):
def __init__(self,
learning_rate=1e-4,
optimizer_name="adam",
regularization_coeff=1e-3):
scheduler = {}
regularization = {"L2SP": regularization_coeff}
clip = {}
super(L2SPFinetuneStrategy, self).__init__(
learning_rate=learning_rate, optimizer_name=optimizer_name)
self.learning_rate = learning_rate
self._optimizer_name = optimizer_name
self.regularization_coeff = regularization_coeff
optimizer_name=optimizer_name,
learning_rate=learning_rate,
scheduler=scheduler,
regularization=regularization,
clip=clip)
def execute(self, loss, data_reader, config):
# get pretrained parameters
program = loss.block.program
global_block = program.global_block()
pretrained_params = get_pretrained_parameter(
program, fluid.default_startup_program())
# set parameter attrs
for index, param in enumerate(pretrained_params):
param.regularizer = L2SPDecayRegularizer(
regularization_coeff=self.regularization_coeff)
class DefaultFinetuneStrategy(CombinedStrategy):
def __init__(self,
learning_rate=1e-4,
optimizer_name="adam",
regularization_coeff=1e-3):
scheduler = {}
regularization = {"L2": regularization_coeff}
clip = {}
if self.optimizer is not None:
self.optimizer.minimize(loss)
else:
raise ValueError("DefaultFinetuneStrategy's optimizer is None")
super(DefaultFinetuneStrategy, self).__init__(
optimizer_name=optimizer_name,
learning_rate=learning_rate,
scheduler=scheduler,
regularization=regularization,
clip=clip)
class ULMFiTStrategy(CombinedStrategy):
    """CombinedStrategy preset enabling three scheduler options at once.

    It configures ``slanted_triangle``, ``gradual_unfreeze`` and
    ``discriminative``, with no regularization and no gradient clipping.
    """

    def __init__(self,
                 learning_rate=1e-4,
                 optimizer_name="adam",
                 cut_fraction=0.1,
                 ratio=32,
                 dis_blocks=3,
                 factor=2.6,
                 frz_blocks=3):
        """Translate the flat arguments into CombinedStrategy's scheduler dict.

        Args:
            learning_rate: base learning rate.
            optimizer_name: optimizer name forwarded to the base class.
            cut_fraction: ``slanted_triangle["cut_fraction"]``.
            ratio: ``slanted_triangle["ratio"]``.
            dis_blocks: ``discriminative["blocks"]``.
            factor: ``discriminative["factor"]``.
            frz_blocks: value of ``gradual_unfreeze``.
        """
        slanted_triangle = {"cut_fraction": cut_fraction, "ratio": ratio}
        discriminative = {"blocks": dis_blocks, "factor": factor}
        super(ULMFiTStrategy, self).__init__(
            optimizer_name=optimizer_name,
            learning_rate=learning_rate,
            scheduler={
                "slanted_triangle": slanted_triangle,
                "gradual_unfreeze": frz_blocks,
                "discriminative": discriminative,
            },
            regularization={},
            clip={})
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .basic_task import BasicTask, RunEnv, RunState
from .classifier_task import ClassifierTask, ImageClassifierTask, TextClassifierTask, MultiLabelClassifierTask
from .reading_comprehension_task import ReadingComprehensionTask
from .regression_task import RegressionTask
from .sequence_task import SequenceLabelTask
#coding:utf-8
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
......@@ -18,29 +18,19 @@ from __future__ import division
from __future__ import print_function
import os
import collections
import contextlib
import time
import multiprocessing
import copy
import numpy as np
import paddle.fluid as fluid
from visualdl import LogWriter
from tb_paddle import SummaryWriter
import paddlehub as hub
from paddlehub.common.paddle_helper import dtype_map, clone_program
from paddlehub.common.utils import mkdir, to_list
from paddlehub.common.logger import logger
from paddlehub.finetune.checkpoint import load_checkpoint, save_checkpoint
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.finetune.config import RunConfig
__all__ = [
"ClassifierTask", "ImageClassifierTask", "TextClassifierTask",
"SequenceLabelTask", "MultiLabelClassifierTask"
]
class RunState(object):
def __init__(self, length):
......@@ -92,11 +82,24 @@ class BasicTask(object):
data_reader,
main_program=None,
startup_program=None,
config=None):
config=None,
metrics_choices="default"):
# base item
self._base_data_reader = data_reader
self._base_feed_list = feed_list
# metrics item
self.best_score = -999
if metrics_choices == "default":
metrics_choices = ["acc"]
elif metrics_choices == None:
metrics_choices = []
if isinstance(metrics_choices, list):
self.metrics_choices = metrics_choices
else:
self.metrics_choices = [metrics_choices]
if main_program is None:
self._base_main_program = clone_program(
fluid.default_main_program(), for_test=False)
......@@ -138,13 +141,16 @@ class BasicTask(object):
if not os.path.exists(self.config.checkpoint_dir):
mkdir(self.config.checkpoint_dir)
vdl_log_dir = os.path.join(self.config.checkpoint_dir, "vdllog")
self.log_writer = LogWriter(vdl_log_dir, sync_cycle=1)
self.tb_writer = SummaryWriter(vdl_log_dir)
# run environment
self._phases = []
self._envs = {}
self._predict_data = None
# accelerate predict
self.is_best_model_loaded = False
# set default phase
self.enter_phase("train")
......@@ -164,9 +170,24 @@ class BasicTask(object):
def init_if_necessary(self):
if not self.is_checkpoint_loaded:
self.is_checkpoint_loaded = True
if not self.load_checkpoint():
self.exe.run(self._base_startup_program)
self.is_checkpoint_loaded = True
self.is_best_model_loaded = False
def init_if_load_best_model(self):
    """Load ``<checkpoint_dir>/best_model`` once, or fall back to normal init.

    If the best model is already loaded this only logs a message. Otherwise
    the parameters are loaded from the best-model directory when it exists,
    and ``init_if_necessary`` is called when it does not.
    """
    if self.is_best_model_loaded:
        logger.info("The best model has been loaded")
        return

    best_model_path = os.path.join(self.config.checkpoint_dir, "best_model")
    logger.info("Load the best model from %s" % best_model_path)

    if not os.path.exists(best_model_path):
        self.init_if_necessary()
        return

    self.load_parameters(best_model_path)
    # Clearing this flag makes the next init_if_necessary() run its
    # initialisation again.
    self.is_checkpoint_loaded = False
    self.is_best_model_loaded = True
def _build_env(self):
if self.env.is_inititalized:
......@@ -242,18 +263,15 @@ class BasicTask(object):
with fluid.program_guard(self.env.main_program,
self._base_startup_program):
with fluid.unique_name.guard(self.env.UNG):
self.config.strategy.execute(
self.loss, self._base_data_reader, self.config)
self.scheduled_lr, self.max_train_steps = self.config.strategy.execute(
self.loss, self._base_data_reader, self.config,
self.device_count)
if self.is_train_phase:
loss_name = self.env.loss.name
share_vars_from = None
else:
loss_name = None
if self._base_compiled_program is None:
share_vars_from = None
else:
share_vars_from = self._base_compiled_program
if not self.config.use_data_parallel:
......@@ -267,9 +285,6 @@ class BasicTask(object):
share_vars_from=share_vars_from,
build_strategy=self.build_strategy)
if self._base_compiled_program is None:
self._base_compiled_program = self.env.main_program_compiled
self.exe.run(self.env.startup_program)
self._build_env_end_event()
......@@ -348,6 +363,8 @@ class BasicTask(object):
@property
def main_program_to_be_run(self):
    """Program to hand to the executor for the current phase.

    Returns ``self.main_program`` when data parallelism is off. Otherwise
    returns ``self.main_program_compiled``, first recording the current
    environment's compiled program as the base compiled program if none has
    been recorded yet.
    """
    if not self.config.use_data_parallel:
        return self.main_program

    if self._base_compiled_program is None:
        self._base_compiled_program = self.env.main_program_compiled
    return self.main_program_compiled
......@@ -420,7 +437,8 @@ class BasicTask(object):
pass
def _build_env_end_event(self):
pass
if not self.is_predict_phase:
self.env.score_scalar = {}
def _finetune_start_event(self):
logger.info("PaddleHub finetune start")
......@@ -438,14 +456,61 @@ class BasicTask(object):
logger.info("Evaluation on {} dataset start".format(self.phase))
def _eval_end_event(self, run_states):
run_speed = self._calculate_metrics(run_states)
logger.info("[%s dataset evaluation result] [step/sec: %.2f]" %
(self.phase, run_speed))
eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
self.tb_writer.add_scalar(
tag=self.phase + "/Loss [{}]".format(self.phase),
scalar_value=eval_loss,
global_step=self.current_step)
log_scores = ""
for metric in eval_scores:
self.tb_writer.add_scalar(
tag=self.phase + "/{} [{}]".format(metric, self.phase),
scalar_value=eval_scores[metric],
global_step=self.current_step)
log_scores += "%s=%.5f " % (metric, eval_scores[metric])
logger.info(
"[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
(self.phase, eval_loss, log_scores, run_speed))
eval_scores_items = eval_scores.items()
if len(eval_scores_items):
# The first metric is chosen as the main evaluation metric
main_metric, main_value = list(eval_scores_items)[0]
else:
logger.warning(
"None of metrics has been implemented, loss will be used to evaluate."
)
# The larger, the better
main_metric, main_value = "negative loss", -eval_loss
if self.phase in ["dev", "val"] and main_value > self.best_score:
self.best_score = main_value
model_saved_dir = os.path.join(self.config.checkpoint_dir,
"best_model")
logger.info("best model saved to %s [best %s=%.5f]" %
(model_saved_dir, main_metric, main_value))
save_result = fluid.io.save_persistables(
executor=self.exe,
dirname=model_saved_dir,
main_program=self.main_program)
def _log_interval_event(self, run_states):
run_speed = self._calculate_metrics(run_states)
logger.info(
"step %d: [step/sec: %.2f]" % (self.current_step, run_speed))
scores, avg_loss, run_speed = self._calculate_metrics(run_states)
self.tb_writer.add_scalar(
tag=self.phase + "/Loss [{}]".format(self.phase),
scalar_value=avg_loss,
global_step=self.current_step)
log_scores = ""
for metric in scores:
self.tb_writer.add_scalar(
tag=self.phase + "/{} [{}]".format(metric, self.phase),
scalar_value=scores[metric],
global_step=self.current_step)
log_scores += "%s=%.5f " % (metric, scores[metric])
logger.info("step %d / %d: loss=%.5f %s[step/sec: %.2f]" %
(self.current_step, self.max_train_steps, avg_loss,
log_scores, run_speed))
def _save_ckpt_interval_event(self):
self.save_checkpoint()
......@@ -467,9 +532,14 @@ class BasicTask(object):
raise NotImplementedError
def _add_metrics(self):
# Some metrics like acc, auc can be calculated by fluid.layers
# The others can be calculated in _calculate_metrics function
raise NotImplementedError
def _calculate_metrics(self, run_states):
# NOTE: if you want to customize the metrics
# you should make sure that the first parameter returned is a dict
# The first key will be used as main metrics to update the best model
raise NotImplementedError
# NOTE: the current checkpoint-saving mechanism is not complete,
......@@ -479,11 +549,12 @@ class BasicTask(object):
checkpoint_dir=self.config.checkpoint_dir,
current_epoch=self.current_epoch,
global_step=self.current_step,
best_score=self.best_score,
exe=self.exe,
main_program=self.main_program)
def load_checkpoint(self):
is_load_successful, self.env.current_epoch, self.env.current_step = load_checkpoint(
is_load_successful, self.env.current_epoch, self.env.current_step, self.best_score = load_checkpoint(
self.config.checkpoint_dir,
self.exe,
main_program=self.main_program)
......@@ -513,23 +584,29 @@ class BasicTask(object):
run_states = []
if self.current_epoch <= self.config.num_epoch:
while self.current_epoch <= self.config.num_epoch:
self.config.strategy.step()
run_states = self._run(do_eval=do_eval)
self.env.current_epoch += 1
# Save checkpoint after finetune
self.save_checkpoint()
# Final evaluation
if self._base_data_reader.get_dev_examples() != []:
self.eval(phase="dev")
if self._base_data_reader.get_test_examples() != []:
self.eval(phase="test")
self.eval(phase="test", load_best_model=True)
self._finetune_end_event(run_states)
return run_states
def eval(self, phase="dev"):
def eval(self, phase="dev", load_best_model=False):
# Warning: DO NOT use eval(load_best_model=True) in finetune_and_eval
# It would leave the trainer unable to continue training from the checkpoint after eval.
# More importantly, the model should be evaluated on its current performance during training.
with self.phase_guard(phase=phase):
if load_best_model:
self.init_if_load_best_model()
else:
self.init_if_necessary()
self._eval_start_event()
run_states = self._run()
......@@ -538,11 +615,10 @@ class BasicTask(object):
def predict(self, data, load_best_model=True):
with self.phase_guard(phase="predict"):
self.init_if_necessary()
if load_best_model:
best_model_path = os.path.join(self.config.checkpoint_dir,
"best_model")
self.load_parameters(best_model_path)
self.init_if_load_best_model()
else:
self.init_if_necessary()
self._predict_data = data
self._predict_start_event()
run_states = self._run()
......@@ -567,7 +643,6 @@ class BasicTask(object):
for run_step, batch in enumerate(self.reader(), start=1):
if self.config.use_data_parallel and len(batch) < self.device_count:
continue
step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1
num_batch_examples = len(batch)
......@@ -652,460 +727,3 @@ class BasicTask(object):
break
return global_run_states
class ClassifierTask(BasicTask):
    """Classification task: optional ReLU hidden layers and a softmax output.

    Tracks the best dev/val accuracy seen so far and saves the model to
    ``<checkpoint_dir>/best_model`` whenever it improves.
    """

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None):
        """Create the task in the program that owns ``feature``.

        Args:
            feature: input variable of the classifier head.
            num_classes: size of the softmax output layer.
            feed_list: forwarded to BasicTask.
            data_reader: forwarded to BasicTask.
            startup_program: forwarded to BasicTask.
            config: forwarded to BasicTask.
            hidden_units: optional iterable of hidden-layer sizes.
        """
        super(ClassifierTask, self).__init__(
            data_reader=data_reader,
            main_program=feature.block.program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config)
        self.feature = feature
        self.num_classes = num_classes
        self.hidden_units = hidden_units
        self.best_accuracy = -1

    def _build_net(self):
        """Stack the hidden layers and the softmax layer; return ``[probs]``."""
        hidden = self.feature
        if self.hidden_units is not None:
            for layer_size in self.hidden_units:
                hidden = fluid.layers.fc(
                    input=hidden, size=layer_size, act="relu")

        probs = fluid.layers.fc(
            input=hidden,
            size=self.num_classes,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)),
            act="softmax")
        return [probs]

    def _add_label(self):
        """Return the single int64 label input named "label"."""
        label = fluid.layers.data(name="label", dtype="int64", shape=[1])
        return [label]

    def _add_loss(self):
        """Return the mean cross-entropy between the output and the label."""
        per_example_loss = fluid.layers.cross_entropy(
            input=self.outputs[0], label=self.labels[0])
        return fluid.layers.mean(x=per_example_loss)

    def _add_metrics(self):
        """Return a one-element list holding the accuracy variable."""
        accuracy = fluid.layers.accuracy(
            input=self.outputs[0], label=self.labels[0])
        return [accuracy]

    def _build_env_end_event(self):
        """Create the loss and accuracy scalars for non-predict phases."""
        with self.log_writer.mode(self.phase) as writer:
            if self.is_predict_phase:
                return
            self.env.loss_scalar = writer.scalar(
                tag="Loss [{}]".format(self.phase))
            self.env.acc_scalar = writer.scalar(
                tag="Accuracy [{}]".format(self.phase))

    def _calculate_metrics(self, run_states):
        """Aggregate run states into ``(avg_loss, avg_acc, run_speed)``.

        Loss and accuracy are averaged over examples: each state's mean is
        weighted by its ``run_examples``. ``run_speed`` is steps per second
        measured from the first state's ``run_time_begin``.
        """
        total_examples = 0
        total_steps = 0
        weighted_loss = 0
        weighted_acc = 0
        for state in run_states:
            batch_examples = state.run_examples
            total_examples += batch_examples
            total_steps += state.run_step
            # run_results[-1] is read as the loss, run_results[0] as accuracy.
            weighted_loss += np.mean(state.run_results[-1]) * batch_examples
            weighted_acc += np.mean(state.run_results[0]) * batch_examples

        elapsed = time.time() - run_states[0].run_time_begin
        avg_loss = weighted_loss / total_examples
        avg_acc = weighted_acc / total_examples
        run_speed = total_steps / elapsed
        return avg_loss, avg_acc, run_speed

    def _log_interval_event(self, run_states):
        """Record and log the metrics of the latest logging interval."""
        avg_loss, avg_acc, run_speed = self._calculate_metrics(run_states)
        step = self.current_step
        self.env.loss_scalar.add_record(step, avg_loss)
        self.env.acc_scalar.add_record(step, avg_acc)
        logger.info("step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" %
                    (self.current_step, avg_loss, avg_acc, run_speed))

    def _eval_end_event(self, run_states):
        """Log evaluation results and save the model on a new best accuracy."""
        eval_loss, eval_acc, run_speed = self._calculate_metrics(run_states)
        logger.info(
            "[%s dataset evaluation result] loss=%.5f acc=%.5f [step/sec: %.2f]"
            % (self.phase, eval_loss, eval_acc, run_speed))
        self.env.loss_scalar.add_record(self.current_step, eval_loss)
        self.env.acc_scalar.add_record(self.current_step, eval_acc)

        # Only dev/val evaluations can promote a new best model.
        if self.phase in ["dev", "val"] and eval_acc > self.best_accuracy:
            self.best_accuracy = eval_acc
            best_model_dir = os.path.join(self.config.checkpoint_dir,
                                          "best_model")
            logger.info("best model saved to %s [best accuracy=%.5f]" %
                        (best_model_dir, self.best_accuracy))
            fluid.io.save_persistables(
                executor=self.exe,
                dirname=best_model_dir,
                main_program=self.main_program)
ImageClassifierTask = ClassifierTask
class TextClassifierTask(ClassifierTask):
    """ClassifierTask variant that applies dropout to the input feature."""

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None):
        """Forward every argument unchanged to ClassifierTask."""
        super(TextClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units)

    def _build_net(self):
        """Dropout (p=0.1), optional ReLU layers, softmax; return ``[probs]``."""
        hidden = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        if self.hidden_units is not None:
            for layer_size in self.hidden_units:
                hidden = fluid.layers.fc(
                    input=hidden, size=layer_size, act="relu")

        probs = fluid.layers.fc(
            input=hidden,
            size=self.num_classes,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)),
            act="softmax")
        return [probs]
class SequenceLabelTask(BasicTask):
def __init__(
self,
feature,
max_seq_len,
num_classes,
feed_list,
data_reader,
startup_program=None,
config=None,
):
main_program = feature.block.program
super(SequenceLabelTask, self).__init__(
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
startup_program=startup_program,
config=config)
self.feature = feature
self.max_seq_len = max_seq_len
self.num_classes = num_classes
self.best_f1 = -1
def _build_net(self):
self.logits = fluid.layers.fc(
input=self.feature,
size=self.num_classes,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(
name="cls_seq_label_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_seq_label_out_b",
initializer=fluid.initializer.Constant(0.)))
self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
ret_infers = fluid.layers.assign(self.ret_infers)
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
logits = self.logits
logits = fluid.layers.flatten(logits, axis=2)
logits = fluid.layers.softmax(logits)
self.num_labels = logits.shape[1]
return [logits]
def _add_label(self):
label = fluid.layers.data(
name="label", shape=[self.max_seq_len, 1], dtype='int64')
return [label]
def _add_loss(self):
labels = fluid.layers.flatten(self.labels[0], axis=2)
ce_loss = fluid.layers.cross_entropy(
input=self.outputs[0], label=labels)
loss = fluid.layers.mean(x=ce_loss)
return loss
def _add_metrics(self):
self.ret_labels = fluid.layers.reshape(x=self.labels[0], shape=[-1, 1])
return [self.ret_labels, self.ret_infers, self.seq_len]
def _build_env_end_event(self):
with self.log_writer.mode(self.phase) as logw:
if self.is_train_phase:
self.env.loss_scalar = logw.scalar(
tag="Loss [{}]".format(self.phase))
if self.phase in ["dev", "val"]:
self.env.loss_scalar = logw.scalar(
tag="Loss [{}]".format(self.phase))
self.env.f1_scalar = logw.scalar(
tag="F1 [{}]".format(self.phase))
self.env.precision_scalar = logw.scalar(
tag="Precision [{}]".format(self.phase))
self.env.recall_scalar = logw.scalar(
tag="Recall [{}]".format(self.phase))
def _calculate_metrics(self, run_states):
total_infer = total_label = total_correct = loss_sum = 0
run_step = run_time_used = run_examples = 0
for run_state in run_states:
loss_sum += np.mean(run_state.run_results[-1])
np_labels = run_state.run_results[0]
np_infers = run_state.run_results[1]
np_lens = run_state.run_results[2]
label_num, infer_num, correct_num = chunk_eval(
np_labels, np_infers, np_lens, self.num_labels,
self.device_count)
total_infer += infer_num
total_label += label_num
total_correct += correct_num
run_examples += run_state.run_examples
run_step += run_state.run_step
run_time_used = time.time() - run_states[0].run_time_begin
run_speed = run_step / run_time_used
avg_loss = loss_sum / run_examples
precision, recall, f1 = calculate_f1(total_label, total_infer,
total_correct)
return precision, recall, f1, avg_loss, run_speed
def _log_interval_event(self, run_states):
precision, recall, f1, avg_loss, run_speed = self._calculate_metrics(
run_states)
self.env.loss_scalar.add_record(self.current_step, avg_loss)
logger.info("step %d: loss=%.5f [step/sec: %.2f]" %
(self.current_step, avg_loss, run_speed))
def _eval_end_event(self, run_states):
    """Record and log evaluation metrics; save the model when dev/val F1 improves.

    The persistables are written to ``<checkpoint_dir>/best_model`` only
    in the "dev"/"val" phases and only when F1 exceeds ``self.best_f1``.
    """
    precision, recall, f1, avg_loss, run_speed = self._calculate_metrics(
        run_states)

    # Push every metric to its scalar on self.env, in the original order.
    recorded = (
        ("loss_scalar", avg_loss),
        ("f1_scalar", f1),
        ("precision_scalar", precision),
        ("recall_scalar", recall),
    )
    for scalar_name, value in recorded:
        getattr(self.env, scalar_name).add_record(self.current_step, value)

    logger.info("[%s dataset evaluation result] [step/sec: %.2f]" %
                (self.phase, run_speed))
    logger.info(
        "[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]"
        % (self.phase, f1, precision, recall, run_speed))

    # Nothing to save unless this is a dev/val run that beat the best F1.
    if self.phase not in ["dev", "val"] or not f1 > self.best_f1:
        return
    self.best_f1 = f1
    model_saved_dir = os.path.join(self.config.checkpoint_dir, "best_model")
    logger.info("best model saved to %s [best F1=%.5f]" %
                (model_saved_dir, self.best_f1))
    fluid.io.save_persistables(self.exe, dirname=model_saved_dir)
@property
def feed_list(self):
    """Names of the variables to feed in the current phase.

    Always a fresh list: the base feed names followed by the sequence
    length; in the train and test phases the label name is inserted
    before the sequence length.
    """
    names = list(self._base_feed_list)
    if self.is_train_phase or self.is_test_phase:
        names.append(self.labels[0].name)
    names.append(self.seq_len.name)
    return names
@property
def fetch_list(self):
    """Names of the variables to fetch in the current phase.

    Train/test: every metric name followed by the loss name.
    Predict: the inference result name and the sequence length name.
    Any other phase: the names of all network outputs.
    """
    if self.is_train_phase or self.is_test_phase:
        names = [metric.name for metric in self.metrics]
        names.append(self.loss.name)
        return names
    if self.is_predict_phase:
        return [self.ret_infers.name, self.seq_len.name]
    return [output.name for output in self.outputs]
class MultiLabelClassifierTask(ClassifierTask):
    """Multi-label classification task with one two-way softmax head per label.

    Each of the ``num_classes`` labels gets its own fully connected layer
    of size 2. The loss sums the per-label cross-entropies and the metrics
    are the per-label AUC values from ``fluid.layers.auc``.
    """

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None):
        """Forward all arguments to ``ClassifierTask`` and reset the best AUC."""
        super(MultiLabelClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units)
        # Best average AUC seen on dev/val so far; compared in _eval_end_event.
        self.best_avg_auc = -1

    def _build_net(self):
        """Build dropout, optional hidden layers and one softmax head per label.

        Returns:
            List of ``num_classes`` output variables, each of size 2.
        """
        cls_feats = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        if self.hidden_units is not None:
            for n_hidden in self.hidden_units:
                cls_feats = fluid.layers.fc(
                    input=cls_feats, size=n_hidden, act="relu")

        probs = []
        for i in range(self.num_classes):
            # Parameter names carry the label index so every head has its
            # own weights and bias.
            probs.append(
                fluid.layers.fc(
                    input=cls_feats,
                    size=2,
                    param_attr=fluid.ParamAttr(
                        name="cls_out_w_%d" % i,
                        initializer=fluid.initializer.TruncatedNormal(
                            scale=0.02)),
                    bias_attr=fluid.ParamAttr(
                        name="cls_out_b_%d" % i,
                        initializer=fluid.initializer.Constant(0.)),
                    act="softmax"))

        return probs

    def _add_label(self):
        """Declare a single int64 ``label`` input with ``num_classes`` columns."""
        label = fluid.layers.data(
            name="label", shape=[self.num_classes], dtype='int64')
        return [label]

    def _add_loss(self):
        """Sum the cross-entropy of every label head and return its mean."""
        # One label column per head.
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        total_loss = fluid.layers.fill_constant(
            shape=[1], value=0.0, dtype='float64')
        for index, probs in enumerate(self.outputs):
            ce_loss = fluid.layers.cross_entropy(
                input=probs, label=label_split[index])
            total_loss += fluid.layers.reduce_sum(ce_loss)
        loss = fluid.layers.mean(x=total_loss)
        return loss

    def _add_metrics(self):
        """Return one AUC variable per label head."""
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        # metrics change to auc of every class
        eval_list = []
        for index, probs in enumerate(self.outputs):
            current_auc, _, _ = fluid.layers.auc(
                input=probs, label=label_split[index])
            eval_list.append(current_auc)
        return eval_list

    def _build_env_end_event(self):
        """Create the loss/AUC log scalars for the current phase."""
        # NOTE(review): block nesting was reconstructed from a listing
        # without indentation; confirm the AUC scalars belong under the
        # non-predict branch.
        with self.log_writer.mode(self.phase) as logw:
            if not self.is_predict_phase:
                self.env.loss_scalar = logw.scalar(
                    tag="Loss [{}]".format(self.phase))
                if self.is_train_phase:
                    # Per-label AUC scalars are only created for training.
                    self.env.auc_scalar_list = []
                    for i in range(self.num_classes):
                        self.env.auc_scalar_list.append(
                            logw.scalar(tag="AUC_{} [{}]".format(i, "train")))
                self.env.avg_auc_scalar = logw.scalar(
                    tag="Average auc [{}]".format(self.phase))

    def _calculate_metrics(self, run_states):
        """Return ``(avg_loss, auc_list, run_speed)`` for ``run_states``.

        The loss is weighted by the examples of each step and divided by
        ``run_examples * num_classes``.
        """
        loss_sum = run_examples = run_step = 0
        for run_state in run_states:
            run_examples += run_state.run_examples
            run_step += run_state.run_step
            loss_sum += np.mean(
                run_state.run_results[-1]) * run_state.run_examples

        # Only the last state's AUC values are used: every fetched result
        # except the trailing loss.
        # NOTE(review): presumably fluid.layers.auc reports a running
        # value, making the last step the cumulative one -- confirm.
        auc_list = run_states[-1].run_results[:-1]

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / (run_examples * self.num_classes)
        run_speed = run_step / run_time_used

        return avg_loss, auc_list, run_speed

    def _log_interval_event(self, run_states):
        """Record and log loss, average AUC and per-label AUC for an interval."""
        avg_loss, auc_list, run_speed = self._calculate_metrics(run_states)
        self.env.loss_scalar.add_record(self.current_step, avg_loss)
        avg_auc = np.mean(auc_list)
        self.env.avg_auc_scalar.add_record(self.current_step, avg_auc)
        logger.info("step %d: loss=%.5f avg_auc=%.5f [step/sec: %.2f]" %
                    (self.current_step, avg_loss, avg_auc, run_speed))
        for index, auc_scalar in enumerate(self.env.auc_scalar_list):
            label_auc = auc_list[index][0]
            auc_scalar.add_record(self.current_step, label_auc)
            logger.info("label_%d_auc = %.5f" % (index, label_auc))

    def _eval_end_event(self, run_states):
        """Log evaluation results; save the model when dev/val average AUC improves."""
        eval_loss, auc_list, run_speed = self._calculate_metrics(run_states)
        avg_auc = np.mean(auc_list)
        logger.info(
            "[%s dataset evaluation result] loss=%.5f avg_auc=%.5f [step/sec: %.2f]"
            % (self.phase, eval_loss, avg_auc, run_speed))
        for index, auc in enumerate(auc_list):
            logger.info("label_%d_auc = %.5f" % (index, auc[0]))
        self.env.loss_scalar.add_record(self.current_step, eval_loss)
        self.env.avg_auc_scalar.add_record(self.current_step, avg_auc)

        if self.phase in ["dev", "val"] and avg_auc > self.best_avg_auc:
            self.best_avg_auc = avg_auc
            model_saved_dir = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.info("best model saved to %s [best average auc=%.5f]" %
                        (model_saved_dir, self.best_avg_auc))
            fluid.io.save_persistables(
                executor=self.exe,
                dirname=model_saved_dir,
                main_program=self.main_program)

    @property
    def fetch_list(self):
        """Metric names plus the loss name for train/test; otherwise the outputs.

        Outside train/test the output variables themselves are returned,
        not their names.
        """
        if self.is_train_phase or self.is_test_phase:
            return [metric.name for metric in self.metrics] + [self.loss.name]
        return self.outputs
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from .basic_task import BasicTask
class ClassifierTask(BasicTask):
    """Single-label classification task: a softmax layer on top of ``feature``.

    Supported ``metrics_choices`` entries are "acc", "f1" and "matthews";
    the string "default" selects ``["acc"]``.
    """

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        """Set up the task on the program that owns ``feature``."""
        if metrics_choices == "default":
            metrics_choices = ["acc"]

        super(ClassifierTask, self).__init__(
            data_reader=data_reader,
            main_program=feature.block.program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            metrics_choices=metrics_choices)

        self.feature = feature
        self.num_classes = num_classes
        self.hidden_units = hidden_units

    def _build_net(self):
        """Stack optional ReLU layers and the softmax output layer.

        Also stores the arg-max prediction, reshaped to ``[-1, 1]``, in
        ``self.ret_infers``.
        """
        hidden = self.feature
        for width in (self.hidden_units
                      if self.hidden_units is not None else ()):
            hidden = fluid.layers.fc(input=hidden, size=width, act="relu")

        weight_attr = fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02))
        bias_attr = fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.))
        probs = fluid.layers.fc(
            input=hidden,
            size=self.num_classes,
            param_attr=weight_attr,
            bias_attr=bias_attr,
            act="softmax")

        predicted = fluid.layers.argmax(probs, axis=1)
        self.ret_infers = fluid.layers.reshape(x=predicted, shape=[-1, 1])

        return [probs]

    def _add_label(self):
        """Declare the int64 ``label`` input of shape ``[1]``."""
        label = fluid.layers.data(name="label", dtype="int64", shape=[1])
        return [label]

    def _add_loss(self):
        """Return the mean cross-entropy between the output and the label."""
        cross_entropy = fluid.layers.cross_entropy(
            input=self.outputs[0], label=self.labels[0])
        loss = fluid.layers.mean(x=cross_entropy)
        return loss

    def _add_metrics(self):
        """Return a one-element list holding the accuracy variable."""
        return [
            fluid.layers.accuracy(
                input=self.outputs[0], label=self.labels[0])
        ]

    @property
    def fetch_list(self):
        """Names to fetch: label, inference, metrics and loss for train/test."""
        if not (self.is_train_phase or self.is_test_phase):
            return [output.name for output in self.outputs]
        names = [self.labels[0].name, self.ret_infers.name]
        names = names + [metric.name for metric in self.metrics]
        return names + [self.loss.name]

    def _calculate_metrics(self, run_states):
        """Return ``(scores, avg_loss, run_speed)`` aggregated over ``run_states``.

        ``scores`` is an ``OrderedDict`` filled in the order of
        ``self.metrics_choices``.

        Raises:
            ValueError: if a requested metric is not supported.
        """
        loss_sum = acc_sum = run_examples = 0
        run_step = 0
        # Start from an empty array so the stacked result keeps the same
        # dtype as stacking step by step would give.
        label_parts = [np.array([])]
        infer_parts = [np.array([])]
        for state in run_states:
            examples = state.run_examples
            run_examples += examples
            run_step += state.run_step
            # Per-step means are weighted by the number of examples.
            loss_sum += np.mean(state.run_results[-1]) * examples
            acc_sum += np.mean(state.run_results[2]) * examples
            label_parts.append(state.run_results[0].reshape([-1]))
            infer_parts.append(state.run_results[1].reshape([-1]))
        all_labels = np.hstack(label_parts)
        all_infers = np.hstack(infer_parts)

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / run_examples
        run_speed = run_step / run_time_used

        # The first key will be used as main metrics to update the best model
        scores = OrderedDict()
        for metric in self.metrics_choices:
            if metric == "acc":
                scores["acc"] = acc_sum / run_examples
            elif metric == "f1":
                scores["f1"] = calculate_f1_np(all_infers, all_labels)
            elif metric == "matthews":
                scores["matthews"] = matthews_corrcoef(all_infers, all_labels)
            else:
                raise ValueError("Not Support Metric: \"%s\"" % metric)

        return scores, avg_loss, run_speed
# ImageClassifierTask is an alias bound to the same class object as ClassifierTask.
ImageClassifierTask = ClassifierTask
class TextClassifierTask(ClassifierTask):
    """Classifier task that applies dropout to ``feature`` before the layers."""

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        """Forward everything to ``ClassifierTask``; "default" means ``["acc"]``."""
        if metrics_choices == "default":
            metrics_choices = ["acc"]
        super(TextClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units,
            metrics_choices=metrics_choices)

    def _build_net(self):
        """Build dropout, optional ReLU layers and the softmax output layer.

        Also stores the arg-max prediction, reshaped to ``[-1, 1]``, in
        ``self.ret_infers``.
        """
        hidden = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        for width in (self.hidden_units
                      if self.hidden_units is not None else ()):
            hidden = fluid.layers.fc(input=hidden, size=width, act="relu")

        weight_attr = fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02))
        bias_attr = fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.))
        probs = fluid.layers.fc(
            input=hidden,
            size=self.num_classes,
            param_attr=weight_attr,
            bias_attr=bias_attr,
            act="softmax")

        predicted = fluid.layers.argmax(probs, axis=1)
        self.ret_infers = fluid.layers.reshape(x=predicted, shape=[-1, 1])

        return [probs]
class MultiLabelClassifierTask(ClassifierTask):
    """Multi-label classification task with one two-way softmax head per label.

    The only supported metric is "auc" (also what "default" selects). The
    mean AUC is reported together with one ``auc_<class name>`` entry per
    label.
    """

    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        """Forward the arguments to ``ClassifierTask`` and record the class names."""
        if metrics_choices == "default":
            metrics_choices = ["auc"]

        super(MultiLabelClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units,
            metrics_choices=metrics_choices)
        # Class names, in the reader's label_map key order; used to name
        # the per-label AUC scores.
        self.class_name = list(data_reader.label_map.keys())

    def _build_net(self):
        """Build dropout, optional hidden layers and one softmax head per label.

        Returns:
            List of ``num_classes`` output variables, each of size 2.
        """
        cls_feats = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        if self.hidden_units is not None:
            for n_hidden in self.hidden_units:
                cls_feats = fluid.layers.fc(
                    input=cls_feats, size=n_hidden, act="relu")

        probs = []
        for i in range(self.num_classes):
            # Parameter names carry the label index so every head has its
            # own weights and bias.
            probs.append(
                fluid.layers.fc(
                    input=cls_feats,
                    size=2,
                    param_attr=fluid.ParamAttr(
                        name="cls_out_w_%d" % i,
                        initializer=fluid.initializer.TruncatedNormal(
                            scale=0.02)),
                    bias_attr=fluid.ParamAttr(
                        name="cls_out_b_%d" % i,
                        initializer=fluid.initializer.Constant(0.)),
                    act="softmax"))

        return probs

    def _add_label(self):
        """Declare a single int64 ``label`` input with ``num_classes`` columns."""
        label = fluid.layers.data(
            name="label", shape=[self.num_classes], dtype='int64')
        return [label]

    def _add_loss(self):
        """Sum the cross-entropy of every label head and return its mean."""
        # One label column per head.
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        total_loss = fluid.layers.fill_constant(
            shape=[1], value=0.0, dtype='float64')
        for index, probs in enumerate(self.outputs):
            ce_loss = fluid.layers.cross_entropy(
                input=probs, label=label_split[index])
            total_loss += fluid.layers.reduce_sum(ce_loss)
        loss = fluid.layers.mean(x=total_loss)
        return loss

    def _add_metrics(self):
        """Return one AUC variable per label head."""
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        # metrics change to auc of every class
        eval_list = []
        for index, probs in enumerate(self.outputs):
            current_auc, _, _ = fluid.layers.auc(
                input=probs, label=label_split[index])
            eval_list.append(current_auc)
        return eval_list

    def _calculate_metrics(self, run_states):
        """Return ``(scores, avg_loss, run_speed)`` aggregated over ``run_states``.

        The loss is weighted by the examples of each step and divided by
        ``run_examples * num_classes``.

        Raises:
            ValueError: if a requested metric is not "auc".
        """
        loss_sum = run_examples = run_step = 0
        for run_state in run_states:
            run_examples += run_state.run_examples
            run_step += run_state.run_step
            loss_sum += np.mean(
                run_state.run_results[-1]) * run_state.run_examples

        # Only the last state's AUC values are used: every fetched result
        # except the trailing loss.
        # NOTE(review): presumably fluid.layers.auc reports a running
        # value, making the last step the cumulative one -- confirm.
        auc_list = run_states[-1].run_results[:-1]

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / (run_examples * self.num_classes)
        run_speed = run_step / run_time_used

        # The first key will be used as main metrics to update the best model
        scores = OrderedDict()
        for metric in self.metrics_choices:
            if metric == "auc":
                scores["auc"] = np.mean(auc_list)
                # NOTE: for MultiLabelClassifierTask, the metrics will be used to evaluate all the label
                #      and their mean value will also be reported.
                for index, auc in enumerate(auc_list):
                    scores["auc_" + self.class_name[index]] = auc[0]
            else:
                raise ValueError("Not Support Metric: \"%s\"" % metric)
        return scores, avg_loss, run_speed

    @property
    def fetch_list(self):
        """Metric names plus the loss name for train/test; otherwise the outputs.

        Outside train/test the output variables themselves are returned,
        not their names.
        """
        if self.is_train_phase or self.is_test_phase:
            return [metric.name for metric in self.metrics] + [self.loss.name]
        return self.outputs
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from .basic_task import BasicTask
class ReadingComprehensionTask(BasicTask):
    """Span-extraction task that predicts start and end logits per position."""

    def __init__(self,
                 feature,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 metrics_choices=None):
        """Set up the task on the program that owns ``feature``."""
        super(ReadingComprehensionTask, self).__init__(
            data_reader=data_reader,
            main_program=feature.block.program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            metrics_choices=metrics_choices)
        self.feature = feature

    def _build_net(self):
        """Build the two-way projection and split it into start/end logits.

        Returns:
            ``[start_logits, end_logits, num_seqs]`` where ``num_seqs`` is
            the sum of a batch-sized tensor of ones.
        """
        if self.is_predict_phase:
            # The unique-id input is declared under the name
            # "start_positions".
            # NOTE(review): presumably this matches the slot the reader
            # feeds at predict time -- confirm.
            self.unique_id = fluid.layers.data(
                name="start_positions",
                shape=[-1, 1],
                lod_level=0,
                dtype="int64")

        weight_attr = fluid.ParamAttr(
            name="cls_seq_label_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02))
        bias_attr = fluid.ParamAttr(
            name="cls_seq_label_out_b",
            initializer=fluid.initializer.Constant(0.))
        projected = fluid.layers.fc(
            input=self.feature,
            size=2,
            num_flatten_dims=2,
            param_attr=weight_attr,
            bias_attr=bias_attr)

        # Move the size-2 axis to the front, then unstack it into the
        # start and end logits.
        transposed = fluid.layers.transpose(x=projected, perm=[2, 0, 1])
        start_logits, end_logits = fluid.layers.unstack(x=transposed, axis=0)

        ones = fluid.layers.fill_constant_batch_size_like(
            input=start_logits, dtype='int64', shape=[1], value=1)
        num_seqs = fluid.layers.reduce_sum(input=ones)

        return [start_logits, end_logits, num_seqs]

    def _add_label(self):
        """Declare the int64 ``start_positions`` and ``end_positions`` inputs."""
        labels = []
        for label_name in ("start_positions", "end_positions"):
            labels.append(
                fluid.layers.data(
                    name=label_name,
                    shape=[-1, 1],
                    lod_level=0,
                    dtype="int64"))
        return labels

    def _add_loss(self):
        """Return the average of the start and end softmax cross-entropy means."""
        start_positions, end_positions = self.labels[0], self.labels[1]
        start_logits, end_logits = self.outputs[0], self.outputs[1]

        start_loss = fluid.layers.mean(
            x=fluid.layers.softmax_with_cross_entropy(
                logits=start_logits, label=start_positions))
        end_loss = fluid.layers.mean(
            x=fluid.layers.softmax_with_cross_entropy(
                logits=end_logits, label=end_positions))
        return (start_loss + end_loss) / 2.0

    def _add_metrics(self):
        """No in-graph metrics are defined for this task."""
        return []

    @property
    def feed_list(self):
        """Base feed names, plus labels (train) or the unique id (predict)."""
        names = list(self._base_feed_list)
        if self.is_train_phase:
            names += [self.labels[0].name, self.labels[1].name]
        elif self.is_predict_phase:
            names += [self.unique_id.name]
        return names

    @property
    def fetch_list(self):
        """Names to fetch for train and predict; ``None`` in any other phase."""
        if self.is_train_phase:
            names = [metric.name for metric in self.metrics]
            return names + [self.loss.name, self.outputs[-1].name]
        if self.is_predict_phase:
            return [self.unique_id.name
                    ] + [output.name for output in self.outputs]
        # No fetch list is defined for the remaining phases.
        return None

    def _calculate_metrics(self, run_states):
        """Return ``(scores, avg_loss, run_speed)``; ``scores`` is always empty.

        The average loss weights each step's loss by its number of
        sequences.
        """
        weighted_costs, seq_counts = [], []
        run_step = run_examples = 0
        for state in run_states:
            step_loss = state.run_results[0]
            step_num_seqs = state.run_results[1]
            weighted_costs.extend(step_loss * step_num_seqs)
            seq_counts.extend(step_num_seqs)
            run_examples += state.run_examples
            run_step += state.run_step

        run_time_used = time.time() - run_states[0].run_time_begin
        run_speed = run_step / run_time_used
        avg_loss = np.sum(weighted_costs) / np.sum(seq_counts)

        # If none of metrics has been implemented, loss will be used to evaluate.
        scores = OrderedDict()
        return scores, avg_loss, run_speed
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from scipy.stats import spearmanr
from .basic_task import BasicTask
class RegressionTask(BasicTask):
    """Regression task: one linear output trained with squared error.

    The only supported metric is "spearman" (also what "default" selects).
    """

    def __init__(self,
                 feature,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        """Set up the task on the program that owns ``feature``."""
        if metrics_choices == "default":
            metrics_choices = ["spearman"]

        super(RegressionTask, self).__init__(
            data_reader=data_reader,
            main_program=feature.block.program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            metrics_choices=metrics_choices)
        self.feature = feature
        self.hidden_units = hidden_units

    def _build_net(self):
        """Build dropout, optional ReLU layers and a size-1 output without activation."""
        hidden = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        for width in (self.hidden_units
                      if self.hidden_units is not None else ()):
            hidden = fluid.layers.fc(input=hidden, size=width, act="relu")

        weight_attr = fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02))
        bias_attr = fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.))
        prediction = fluid.layers.fc(
            input=hidden,
            size=1,
            param_attr=weight_attr,
            bias_attr=bias_attr,
            act=None)

        return [prediction]

    def _add_label(self):
        """Declare the float32 ``label`` input of shape ``[1]``."""
        label = fluid.layers.data(name="label", dtype="float32", shape=[1])
        return [label]

    def _add_loss(self):
        """Return the mean squared error between the output and the label."""
        squared_error = fluid.layers.square_error_cost(
            input=self.outputs[0], label=self.labels[0])
        loss = fluid.layers.mean(x=squared_error)
        return loss

    def _add_metrics(self):
        """No in-graph metrics are defined for this task."""
        return []

    @property
    def fetch_list(self):
        """Names to fetch: label, output, metrics and loss for train/test."""
        if not (self.is_train_phase or self.is_test_phase):
            return [output.name for output in self.outputs]
        names = [self.labels[0].name, self.outputs[0].name]
        names = names + [metric.name for metric in self.metrics]
        return names + [self.loss.name]

    def _calculate_metrics(self, run_states):
        """Return ``(scores, avg_loss, run_speed)`` aggregated over ``run_states``.

        Raises:
            ValueError: if a requested metric is not "spearman".
        """
        loss_sum = run_examples = 0
        run_step = 0
        # Start from an empty array so the stacked result keeps the same
        # dtype as stacking step by step would give.
        label_parts = [np.array([])]
        infer_parts = [np.array([])]
        for state in run_states:
            run_examples += state.run_examples
            run_step += state.run_step
            # Per-step mean loss is weighted by the number of examples.
            loss_sum += np.mean(state.run_results[-1]) * state.run_examples
            label_parts.append(state.run_results[0].reshape([-1]))
            infer_parts.append(state.run_results[1].reshape([-1]))
        all_labels = np.hstack(label_parts)
        all_infers = np.hstack(infer_parts)

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / run_examples
        run_speed = run_step / run_time_used

        # The first key will be used as main metrics to update the best model
        scores = OrderedDict()
        for metric in self.metrics_choices:
            if metric != "spearman":
                raise ValueError("Not Support Metric: \"%s\"" % metric)
            scores["spearman"] = spearmanr(all_labels, all_infers)[0]

        return scores, avg_loss, run_speed
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from .basic_task import BasicTask
class SequenceLabelTask(BasicTask):
    """Sequence labeling task: a per-position classifier scored by chunk metrics.

    Supported ``metrics_choices`` entries are "f1", "precision" and
    "recall"; the string "default" selects all three in that order.
    """

    def __init__(self,
                 feature,
                 max_seq_len,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 metrics_choices="default"):
        """Set up the task on the program that owns ``feature``."""
        if metrics_choices == "default":
            metrics_choices = ["f1", "precision", "recall"]

        main_program = feature.block.program
        super(SequenceLabelTask, self).__init__(
            data_reader=data_reader,
            main_program=main_program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            metrics_choices=metrics_choices)
        self.feature = feature
        self.max_seq_len = max_seq_len
        self.num_classes = num_classes

    def _build_net(self):
        """Build the per-position projection and its softmax output.

        Side effects: sets ``self.logits``, ``self.ret_infers`` (arg-max
        reshaped to ``[-1, 1]``), ``self.seq_len`` (the "seq_len" input)
        and ``self.num_labels``.
        """
        self.logits = fluid.layers.fc(
            input=self.feature,
            size=self.num_classes,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                name="cls_seq_label_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_seq_label_out_b",
                initializer=fluid.initializer.Constant(0.)))

        self.ret_infers = fluid.layers.reshape(
            x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
        # The assign results are never read; the calls are kept because
        # they add ops to the program.
        # NOTE(review): confirm whether these assign ops are still needed.
        fluid.layers.assign(self.ret_infers)

        self.seq_len = fluid.layers.data(
            name="seq_len", shape=[1], dtype='int64')
        fluid.layers.assign(self.seq_len)

        logits = fluid.layers.flatten(self.logits, axis=2)
        logits = fluid.layers.softmax(logits)
        self.num_labels = logits.shape[1]
        return [logits]

    def _add_label(self):
        """Declare the int64 ``label`` input of shape ``[max_seq_len, 1]``."""
        label = fluid.layers.data(
            name="label", shape=[self.max_seq_len, 1], dtype='int64')
        return [label]

    def _add_loss(self):
        """Return the mean cross-entropy between the output and flattened labels."""
        labels = fluid.layers.flatten(self.labels[0], axis=2)
        ce_loss = fluid.layers.cross_entropy(
            input=self.outputs[0], label=labels)
        loss = fluid.layers.mean(x=ce_loss)
        return loss

    def _add_metrics(self):
        """Return the label, inference and sequence-length variables to fetch."""
        self.ret_labels = fluid.layers.reshape(x=self.labels[0], shape=[-1, 1])
        return [self.ret_labels, self.ret_infers, self.seq_len]

    def _calculate_metrics(self, run_states):
        """Return ``(scores, avg_loss, run_speed)`` aggregated over ``run_states``.

        ``scores`` is an ``OrderedDict`` filled in the order of
        ``self.metrics_choices``.

        Raises:
            ValueError: if a requested metric is not supported.
        """
        total_infer = total_label = total_correct = 0
        loss_sum = run_step = run_examples = 0
        for run_state in run_states:
            # The fetched loss is a per-step mean. Weight it by the number
            # of examples in the step so that dividing by the total
            # example count below yields a per-example average instead of
            # a value that shrinks with the batch size.
            loss_sum += np.mean(
                run_state.run_results[-1]) * run_state.run_examples
            np_labels = run_state.run_results[0]
            np_infers = run_state.run_results[1]
            np_lens = run_state.run_results[2]
            label_num, infer_num, correct_num = chunk_eval(
                np_labels, np_infers, np_lens, self.num_labels,
                self.device_count)
            total_infer += infer_num
            total_label += label_num
            total_correct += correct_num
            run_examples += run_state.run_examples
            run_step += run_state.run_step

        run_time_used = time.time() - run_states[0].run_time_begin
        run_speed = run_step / run_time_used
        avg_loss = loss_sum / run_examples
        precision, recall, f1 = calculate_f1(total_label, total_infer,
                                             total_correct)
        # The first key will be used as main metrics to update the best model
        scores = OrderedDict()

        for metric in self.metrics_choices:
            if metric == "precision":
                scores["precision"] = precision
            elif metric == "recall":
                scores["recall"] = recall
            elif metric == "f1":
                scores["f1"] = f1
            else:
                raise ValueError("Not Support Metric: \"%s\"" % metric)

        return scores, avg_loss, run_speed

    @property
    def feed_list(self):
        """Base feed names plus the sequence length, and the label for train/test."""
        feed_list = [varname for varname in self._base_feed_list]
        if self.is_train_phase or self.is_test_phase:
            feed_list += [self.labels[0].name, self.seq_len.name]
        else:
            feed_list += [self.seq_len.name]
        return feed_list

    @property
    def fetch_list(self):
        """Names to fetch: metrics and loss (train/test), inference (predict), else outputs."""
        if self.is_train_phase or self.is_test_phase:
            return [metric.name for metric in self.metrics] + [self.loss.name]
        elif self.is_predict_phase:
            return [self.ret_infers.name] + [self.seq_len.name]
        return [output.name for output in self.outputs]
......@@ -26,6 +26,7 @@ from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import MODULE_HOME
from paddlehub.module import module_desc_pb2
import paddlehub as hub
from paddlehub.common.logger import logger
class LocalModuleManager(object):
......@@ -35,23 +36,26 @@ class LocalModuleManager(object):
if not os.path.exists(self.local_modules_dir):
utils.mkdir(self.local_modules_dir)
elif os.path.isfile(self.local_modules_dir):
#TODO(wuzewu): give warning
pass
raise ValueError("Module home should be a folder, not a file")
def check_module_valid(self, module_path):
#TODO(wuzewu): code
info = {}
try:
desc_pb_path = os.path.join(module_path, 'module_desc.pb')
if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path):
info = {}
desc = module_desc_pb2.ModuleDesc()
with open(desc_pb_path, "rb") as fp:
desc.ParseFromString(fp.read())
info['version'] = desc.attr.map.data["module_info"].map.data[
"version"].s
return True, info
else:
logger.warning(
"%s does not exist, the module will be reinstalled" %
desc_pb_path)
except:
pass
return False, None
return True, info
def all_modules(self, update=False):
if not update and self.modules_dict:
......@@ -60,7 +64,6 @@ class LocalModuleManager(object):
for sub_dir_name in os.listdir(self.local_modules_dir):
sub_dir_path = os.path.join(self.local_modules_dir, sub_dir_name)
if os.path.isdir(sub_dir_path):
#TODO(wuzewu): get module name
valid, info = self.check_module_valid(sub_dir_path)
if valid:
module_name = sub_dir_name
......@@ -92,7 +95,6 @@ class LocalModuleManager(object):
url = search_result.get('url', None)
md5_value = search_result.get('md5', None)
installed_module_version = search_result.get('version', None)
#TODO(wuzewu): add compatibility check
if not url or (module_version is not None and installed_module_version
!= module_version) or (name != module_name):
tips = "Can't find module %s" % module_name
......
......@@ -117,7 +117,6 @@ class Module(object):
self.cache_fetch_dict = None
self.cache_program = None
# TODO(wuzewu): print more module loading info log
if name:
self._init_with_name(name=name, version=version)
elif module_dir:
......@@ -458,7 +457,6 @@ class Module(object):
fetch_dict = self.cache_fetch_dict
program = self.cache_program
#TODO(wuzewu): more option
fetch_list = list(set([value for key, value in fetch_dict.items()]))
with fluid.program_guard(program):
result = []
......@@ -554,7 +552,6 @@ class Module(object):
self._recover_variable_info(program)
paddle_helper.set_op_attr(program, is_test=for_test)
#TODO(wuzewu): return feed_list and fetch_list directly
feed_dict = {}
fetch_dict = {}
for index, var in enumerate(signature.inputs):
......@@ -569,7 +566,6 @@ class Module(object):
if key:
fetch_dict[key] = program.global_block().var(var.name)
# TODO(ZeyuChen) encapsulate into a function
# update BERT/ERNIE's input tensor's sequence length to max_seq_len
if self.name.startswith("bert") or self.name.startswith("ernie"):
MAX_SEQ_LENGTH = 512
......
......@@ -17,4 +17,6 @@ from .nlp_reader import ClassifyReader
from .nlp_reader import SequenceLabelReader
from .nlp_reader import LACClassifyReader
from .nlp_reader import MultiLabelClassifyReader
from .nlp_reader import ReadingComprehensionReader
from .nlp_reader import RegressionReader
from .cv_reader import ImageClassificationReader
......@@ -17,7 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import csv
import collections
import json
import numpy as np
import platform
......@@ -31,7 +31,7 @@ from paddlehub.reader import tokenization
from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdout_encoding
from paddlehub.dataset.dataset import InputExample
from .batching import pad_batch_data
from .batching import pad_batch_data, prepare_batch_data
import paddlehub as hub
......@@ -43,7 +43,8 @@ class BaseReader(object):
max_seq_len=512,
do_lower_case=True,
random_seed=None,
use_task_id=False):
use_task_id=False,
in_tokens=False):
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
......@@ -52,7 +53,7 @@ class BaseReader(object):
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.in_tokens = False
self.in_tokens = in_tokens
self.use_task_id = use_task_id
if self.use_task_id:
......@@ -202,6 +203,9 @@ class BaseReader(object):
return record
def _pad_batch_records(self, batch_records, phase):
raise NotImplementedError
def _prepare_batch_data(self, examples, batch_size, phase=None):
"""generate batch records"""
batch_records, max_len = [], 0
......@@ -494,7 +498,7 @@ class SequenceLabelReader(BaseReader):
class LACClassifyReader(object):
def __init__(self, dataset, vocab_path):
def __init__(self, dataset, vocab_path, in_tokens=False):
self.dataset = dataset
self.lac = hub.Module(name="lac")
self.tokenizer = tokenization.FullTokenizer(
......@@ -505,6 +509,7 @@ class LACClassifyReader(object):
sign_name="lexical_analysis").keys())[0]
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.in_tokens = in_tokens
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
......@@ -719,5 +724,536 @@ class MultiLabelClassifyReader(BaseReader):
return record
class SquadInputFeatures(object):
    """A single set of features of squad_data.

    Plain value holder: every constructor argument is stored unchanged
    under an attribute of the same name.
    """

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        # Identification of the feature.
        self.unique_id, self.example_index, self.doc_span_index = (
            unique_id, example_index, doc_span_index)

        # Token-level data.
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context

        # Model inputs.
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids, input_mask, segment_ids)

        # Answer annotation; all three default to None.
        self.start_position, self.end_position = start_position, end_position
        self.is_impossible = is_impossible
class RegressionReader(BaseReader):
    """Reader for regression tasks: labels are emitted as float32 values.

    The constructor does not delegate to ``BaseReader.__init__``; it sets up
    its own tokenizer, vocabulary ids and counters, and leaves ``label_map``
    empty.
    """

    def __init__(self,
                 dataset,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=128,
                 do_lower_case=True,
                 random_seed=None):
        """Build the tokenizer and initialise the reader state.

        Args:
            dataset: Object that supplies the examples.
            vocab_path: Path of the vocabulary file given to the tokenizer.
            label_map_config: Accepted for signature compatibility; it is not
                read by this constructor.
            max_seq_len: Sequence length that batches are padded to.
            do_lower_case: Forwarded to the tokenizer.
            random_seed: Seed passed to ``np.random.seed`` (global state).
        """
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.dataset = dataset

        # Ids of the special tokens, looked up once from the vocabulary.
        self.pad_id, self.cls_id, self.sep_id = [
            self.vocab[special] for special in ("[PAD]", "[CLS]", "[SEP]")
        ]
        self.in_tokens = False

        np.random.seed(random_seed)

        # generate label map
        self.label_map = {}  # Unlike BaseReader, it's not filled

        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}

    def _pad_batch_records(self, batch_records, phase=None):
        """Pad one batch of records to ``max_seq_len``.

        Returns:
            ``[token_ids, position_ids, text_type_ids, input_mask]``, followed
            by a float32 label array of shape ``[-1, 1]`` unless ``phase`` is
            ``"predict"``.
        """
        token_id_rows = [rec.token_ids for rec in batch_records]
        text_type_rows = [rec.text_type_ids for rec in batch_records]
        position_rows = [rec.position_ids for rec in batch_records]

        padded_token_ids, input_mask = pad_batch_data(
            token_id_rows,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id,
            return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            text_type_rows, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            position_rows, max_seq_len=self.max_seq_len, pad_idx=self.pad_id)

        outputs = [
            padded_token_ids, padded_position_ids, padded_text_type_ids,
            input_mask
        ]
        if phase == "predict":
            return outputs

        # the only diff with ClassifyReader: astype("float32")
        label_column = np.array([rec.label_id for rec in batch_records])
        outputs.append(label_column.astype("float32").reshape([-1, 1]))
        return outputs

    def data_generator(self,
                       batch_size=1,
                       phase='train',
                       shuffle=True,
                       data=None):
        """Return a generator function that yields one-element batch lists.

        Args:
            batch_size: Passed through to ``_prepare_batch_data``.
            phase: One of ``'train'``, ``'dev'`` (alias ``'val'``),
                ``'test'`` or ``'predict'``.
            shuffle: Overridden by every phase: forced to True for
                ``'train'`` and to False otherwise.
            data: Only read in the ``'predict'`` phase; an iterable of items
                holding one or two texts each.

        Raises:
            ValueError: For an unknown phase, or a predict item whose length
                is neither 1 nor 2.
        """
        if phase == 'predict':
            shuffle = False
            examples = []
            for seq_id, item in enumerate(data):
                text_count = len(item)
                if text_count not in (1, 2):
                    raise ValueError(
                        "The length of input_text is out of handling, which must be 1 or 2!"
                    )
                # set label in order to run the program
                # (-1 here, which is different from BaseReader)
                fields = dict(guid=seq_id, text_a=item[0], label=-1)
                if text_count == 2:
                    fields["text_b"] = item[1]
                examples.append(InputExample(**fields))
        elif phase == 'train':
            shuffle = True
            examples = self.get_train_examples()
            self.num_examples['train'] = len(examples)
        elif phase in ('val', 'dev'):
            shuffle = False
            examples = self.get_dev_examples()
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            shuffle = False
            examples = self.get_test_examples()
            self.num_examples['test'] = len(examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
            )

        def wrapper():
            if shuffle:
                np.random.shuffle(examples)
            batches = self._prepare_batch_data(
                examples, batch_size, phase=phase)
            for batch_data in batches:
                yield [batch_data]

        return wrapper
class ReadingComprehensionReader(object):
    """Reader that converts SQuAD-style examples into padded model batches.

    Each example's question and document are tokenized, long documents are cut
    into overlapping spans (sliding window with stride ``doc_stride``), and
    every span becomes one ``SquadInputFeatures`` that is later grouped into
    batches by ``data_generator``.
    """

    def __init__(self,
                 dataset,
                 vocab_path,
                 do_lower_case=True,
                 max_seq_length=512,
                 doc_stride=128,
                 max_query_length=64,
                 random_seed=None):
        """Build the tokenizer and initialise the reader state.

        Args:
            dataset: Object exposing ``get_train_examples``,
                ``get_dev_examples`` and ``get_test_examples``.
            vocab_path: Path of the vocabulary file given to the tokenizer.
            do_lower_case: Forwarded to the tokenizer.
            max_seq_length: Upper bound on ``[CLS] query [SEP] doc [SEP]``.
            doc_stride: Step of the sliding window over document tokens.
            max_query_length: Query tokens beyond this count are dropped.
            random_seed: Seed passed to ``np.random.seed`` (global state).
        """
        self.dataset = dataset
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self._max_seq_length = max_seq_length
        self._doc_stride = doc_stride
        self._max_query_length = max_query_length
        self._in_tokens = False

        np.random.seed(random_seed)

        self.vocab = self._tokenizer.vocab
        self.vocab_size = len(self.vocab)
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]

        self.current_train_example = 0
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}

    def get_train_progress(self):
        """Gets progress for training phase."""
        return self.current_train_example

    def get_train_examples(self):
        """Gets a collection of `SquadExample`s for the train set."""
        return self.dataset.get_train_examples()

    def get_dev_examples(self):
        """Gets a collection of `SquadExample`s for the dev set."""
        return self.dataset.get_dev_examples()

    def get_test_examples(self):
        """Gets a collection of `SquadExample`s for prediction."""
        return self.dataset.get_test_examples()

    def get_num_examples(self, phase):
        """Return the example count recorded for ``phase``.

        The count is -1 until ``data_generator`` has been called for that
        phase.

        Raises:
            ValueError: If ``phase`` is not 'train', 'dev' or 'test'.
        """
        if phase not in ['train', 'dev', 'test']:
            # The message now lists the phases that are actually accepted.
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        return self.num_examples[phase]

    def data_generator(self,
                       batch_size=1,
                       phase='train',
                       shuffle=False,
                       data=None):
        """Return a generator function that yields one-element batch lists.

        Args:
            batch_size: Maximum number of features per batch (or a token
                budget when the reader runs in token mode).
            phase: One of 'train', 'dev', 'test' or 'predict'.
            shuffle: Overridden by every phase: forced to True for 'train'
                and to False otherwise.
            data: Examples to use in the 'predict' phase.

        Raises:
            ValueError: For an unknown phase.
        """
        if phase == 'train':
            shuffle = True
            examples = self.get_train_examples()
            self.num_examples['train'] = len(examples)
        elif phase == 'dev':
            shuffle = False
            examples = self.get_dev_examples()
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            shuffle = False
            examples = self.get_test_examples()
            self.num_examples['test'] = len(examples)
        elif phase == 'predict':
            shuffle = False
            examples = data
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
            )

        def batch_reader(features, batch_size, in_tokens):
            """Group features into (batch, total_token_num) pairs."""
            batch, total_token_num, max_len = [], 0, 0
            for (index, feature) in enumerate(features):
                if phase == 'train':
                    self.current_train_example = index + 1
                seq_len = len(feature.input_ids)
                # Features without an answer annotation carry their unique id
                # in place of the (start, end) labels.
                if feature.start_position is None:
                    labels = [feature.unique_id]
                else:
                    labels = [feature.start_position, feature.end_position]
                # Position ids are a real list (not a lazy range object) so
                # the record has the same type on Python 2 and Python 3.
                example = [
                    feature.input_ids, feature.segment_ids,
                    list(range(seq_len))
                ] + labels
                max_len = max(max_len, seq_len)
                if in_tokens:
                    to_append = (len(batch) + 1) * max_len <= batch_size
                else:
                    to_append = len(batch) < batch_size
                if to_append:
                    batch.append(example)
                    total_token_num += seq_len
                else:
                    # Never hand an empty batch downstream; this could happen
                    # when the very first example already exceeds the budget.
                    if batch:
                        yield batch, total_token_num
                    batch, total_token_num, max_len = [example
                                                       ], seq_len, seq_len
            if batch:
                yield batch, total_token_num

        def wrapper():
            if shuffle:
                np.random.shuffle(examples)
            features = self.convert_examples_to_features(
                examples, is_training=(phase == "train"))

            for batch_data, total_token_num in batch_reader(
                    features, batch_size, self._in_tokens):
                batch_data = prepare_batch_data(
                    batch_data,
                    total_token_num,
                    self._max_seq_length,
                    pad_id=self.pad_id,
                    cls_id=self.cls_id,
                    sep_id=self.sep_id,
                    return_input_mask=True,
                    return_max_len=False,
                    return_num_token=False)
                yield [batch_data]

        return wrapper

    def convert_examples_to_features(self, examples, is_training):
        """Lazily yield one `SquadInputFeatures` per document span.

        Args:
            examples: Iterable of examples exposing ``question_text`` and
                ``doc_tokens`` (plus ``is_impossible``, ``start_position``,
                ``end_position`` and ``orig_answer_text`` when training).
            is_training: When true, answer positions are computed for every
                feature.

        Yields:
            SquadInputFeatures, with ``unique_id`` counting up from
            1000000000.

        Raises:
            ValueError: If the span window cannot advance (sequence length too
                small for the query, or a non-positive ``doc_stride`` on a
                document that needs several spans). Previously this case
                looped forever.
        """
        unique_id = 1000000000
        # Built once per call instead of once per example.
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

        for (example_index, example) in enumerate(examples):
            query_tokens = self._tokenizer.tokenize(example.question_text)
            if len(query_tokens) > self._max_query_length:
                query_tokens = query_tokens[0:self._max_query_length]

            # Two-way mapping between original document tokens and the
            # sub-tokens produced by the tokenizer.
            tok_to_orig_index = []
            orig_to_tok_index = []
            all_doc_tokens = []
            for (i, token) in enumerate(example.doc_tokens):
                orig_to_tok_index.append(len(all_doc_tokens))
                sub_tokens = self._tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    all_doc_tokens.append(sub_token)

            tok_start_position = None
            tok_end_position = None
            if is_training:
                if example.is_impossible:
                    tok_start_position = -1
                    tok_end_position = -1
                else:
                    tok_start_position = orig_to_tok_index[
                        example.start_position]
                    if example.end_position < len(example.doc_tokens) - 1:
                        tok_end_position = orig_to_tok_index[
                            example.end_position + 1] - 1
                    else:
                        tok_end_position = len(all_doc_tokens) - 1
                    (tok_start_position,
                     tok_end_position) = self.improve_answer_span(
                         all_doc_tokens, tok_start_position, tok_end_position,
                         self._tokenizer, example.orig_answer_text)

            # The -3 accounts for [CLS], [SEP] and [SEP]
            max_tokens_for_doc = self._max_seq_length - len(query_tokens) - 3

            # We can have documents that are longer than the maximum sequence
            # length. To deal with this we do a sliding window approach, where
            # we take chunks of up to our max length with a stride of
            # `doc_stride`.
            doc_spans = []
            start_offset = 0
            while start_offset < len(all_doc_tokens):
                length = min(
                    len(all_doc_tokens) - start_offset, max_tokens_for_doc)
                if length <= 0:
                    # A non-positive window can never reach the document end.
                    raise ValueError(
                        "max_seq_length (%d) is too small: no room is left "
                        "for document tokens after %d query tokens and 3 "
                        "special tokens." % (self._max_seq_length,
                                             len(query_tokens)))
                doc_spans.append(_DocSpan(start=start_offset, length=length))
                if start_offset + length == len(all_doc_tokens):
                    break
                if self._doc_stride <= 0:
                    # The window would stay in place forever.
                    raise ValueError(
                        "doc_stride must be positive to split a document "
                        "into several spans, got %d." % self._doc_stride)
                start_offset += min(length, self._doc_stride)

            for (doc_span_index, doc_span) in enumerate(doc_spans):
                tokens = []
                token_to_orig_map = {}
                token_is_max_context = {}
                segment_ids = []

                # Layout: [CLS] query [SEP] (segment 0), doc [SEP] (segment 1)
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in query_tokens:
                    tokens.append(token)
                    segment_ids.append(0)
                tokens.append("[SEP]")
                segment_ids.append(0)

                for i in range(doc_span.length):
                    split_token_index = doc_span.start + i
                    token_to_orig_map[len(
                        tokens)] = tok_to_orig_index[split_token_index]

                    is_max_context = self.check_is_max_context(
                        doc_spans, doc_span_index, split_token_index)
                    token_is_max_context[len(tokens)] = is_max_context
                    tokens.append(all_doc_tokens[split_token_index])
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

                # No padding happens here, so every position is a real token
                # and the mask is all ones.
                input_mask = [1] * len(input_ids)

                start_position = None
                end_position = None
                if is_training:
                    if example.is_impossible:
                        start_position = 0
                        end_position = 0
                    else:
                        # A span that does not fully contain the annotated
                        # answer is kept, with both positions pointing at
                        # index 0 (the [CLS] token).
                        doc_start = doc_span.start
                        doc_end = doc_span.start + doc_span.length - 1
                        out_of_span = not (tok_start_position >= doc_start
                                           and tok_end_position <= doc_end)
                        if out_of_span:
                            start_position = 0
                            end_position = 0
                        else:
                            # Skip [CLS], the query tokens and the first [SEP]
                            doc_offset = len(query_tokens) + 2
                            start_position = (
                                tok_start_position - doc_start + doc_offset)
                            end_position = (
                                tok_end_position - doc_start + doc_offset)

                # Dump the features of the first three examples for debugging.
                if example_index < 3:
                    logger.debug("*** Example ***")
                    logger.debug("unique_id: %s" % (unique_id))
                    logger.debug("example_index: %s" % (example_index))
                    logger.debug("doc_span_index: %s" % (doc_span_index))
                    logger.debug("tokens: %s" % " ".join(
                        [tokenization.printable_text(x) for x in tokens]))
                    logger.debug("token_to_orig_map: %s" % " ".join([
                        "%d:%d" % (x, y)
                        for (x, y) in six.iteritems(token_to_orig_map)
                    ]))
                    logger.debug("token_is_max_context: %s" % " ".join([
                        "%d:%s" % (x, y)
                        for (x, y) in six.iteritems(token_is_max_context)
                    ]))
                    logger.debug(
                        "input_ids: %s" % " ".join([str(x) for x in input_ids]))
                    logger.debug("input_mask: %s" % " ".join(
                        [str(x) for x in input_mask]))
                    logger.debug("segment_ids: %s" % " ".join(
                        [str(x) for x in segment_ids]))
                    if is_training and example.is_impossible:
                        logger.debug("impossible example")
                    if is_training and not example.is_impossible:
                        answer_text = " ".join(
                            tokens[start_position:(end_position + 1)])
                        logger.debug("start_position: %d" % (start_position))
                        logger.debug("end_position: %d" % (end_position))
                        logger.debug("answer: %s" %
                                     (tokenization.printable_text(answer_text)))

                feature = SquadInputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=example.is_impossible)

                unique_id += 1

                yield feature

    def improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer,
                            orig_answer_text):
        """Returns tokenized answer spans that better match the annotated answer.

        Args:
            doc_tokens: Sub-tokens of the whole document.
            input_start: First sub-token index of the projected answer.
            input_end: Last sub-token index (inclusive) of the projected
                answer.
            tokenizer: Object whose ``tokenize`` splits the answer text.
            orig_answer_text: The annotated answer string.

        Returns:
            ``(start, end)`` of the first sub-span inside
            ``[input_start, input_end]`` whose tokens equal the tokenized
            answer, or the input span unchanged when there is none.
        """
        # The SQuAD annotations are character based. We first project them to
        # whitespace-tokenized words. But then after WordPiece tokenization, we
        # can often find a "better match". For example:
        #
        #   Question: What year was John Smith born?
        #   Context: The leader was John Smith (1895-1943).
        #   Answer: 1895
        #
        # The original whitespace-tokenized answer will be "(1895-1943).".
        # However after tokenization, our tokens will be "( 1895 - 1943 ) .".
        # So we can match the exact answer, 1895.
        #
        # However, this is not always possible. Consider the following:
        #
        #   Question: What country is the top exporter of electronics?
        #   Context: The Japanese electronics industry is the largest in the
        #            world.
        #   Answer: Japan
        #
        # In this case, the annotator chose "Japan" as a character sub-span of
        # the word "Japanese". Since our WordPiece tokenizer does not split
        # "Japanese", we just use "Japanese" as the annotation. This is fairly
        # rare in SQuAD, but does happen.
        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

        for new_start in range(input_start, input_end + 1):
            for new_end in range(input_end, new_start - 1, -1):
                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
                if text_span == tok_answer_text:
                    return (new_start, new_end)

        return (input_start, input_end)

    def check_is_max_context(self, doc_spans, cur_span_index, position):
        """Check if this is the 'max context' doc span for the token.

        Args:
            doc_spans: Sequence of spans exposing ``start`` and ``length``.
            cur_span_index: Index of the span being evaluated.
            position: Index of the token in the full sub-token document.

        Returns:
            True when ``cur_span_index`` is the span with the best score for
            ``position``; ties go to the earliest span.
        """
        # Because of the sliding window approach taken to scoring documents, a
        # single token can appear in multiple documents. E.g.
        #  Doc: the man went to the store and bought a gallon of milk
        #  Span A: the man went to the
        #  Span B: to the store and bought
        #  Span C: and bought a gallon of
        #  ...
        #
        # Now the word 'bought' will have two scores from spans B and C. We
        # only want to consider the score with "maximum context", which we
        # define as the *minimum* of its left and right context (the *sum* of
        # left and right context will always be the same, of course).
        #
        # In the example the maximum context for 'bought' would be span C since
        # it has 1 left context and 3 right context, while span B has 4 left
        # context and 0 right context.
        best_score = None
        best_span_index = None
        for (span_index, doc_span) in enumerate(doc_spans):
            end = doc_span.start + doc_span.length - 1
            if position < doc_span.start:
                continue
            if position > end:
                continue
            num_left_context = position - doc_span.start
            num_right_context = end - position
            # The small length term favours longer spans on equal context.
            score = min(num_left_context,
                        num_right_context) + 0.01 * doc_span.length
            if best_score is None or score > best_score:
                best_score = score
                best_span_index = span_index

        return cur_span_index == best_span_index
# Script entry guard: running this module directly does nothing.
if __name__ == '__main__':
    pass
......@@ -21,7 +21,6 @@ from __future__ import print_function
import collections
import io
import unicodedata
import six
......
visualdl >= 1.3.0
pre-commit
protobuf >= 3.1.0
yapf == 0.26.0
......@@ -12,3 +11,5 @@ requests
pandas
#[py2]pandas == 0.24.0
flake8
tb-paddle
cma == 2.7.0
......@@ -32,7 +32,7 @@ max_version, mid_version, min_version = python_version()
REQUIRED_PACKAGES = [
'six >= 1.10.0', 'protobuf >= 3.1.0', 'pyyaml', 'Pillow', 'requests',
"visualdl >= 1.3.0"
'visualdl >= 1.3.0', 'cma == 2.7.0'
]
if max_version < 3:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册