diff --git a/demo/image-classification/img_classifier.py b/demo/image-classification/img_classifier.py index 5083c972d793a0f2b7cf340c39cb25a68b74b130..27acea75a70425ef447de9d6215ab9af57e92854 100644 --- a/demo/image-classification/img_classifier.py +++ b/demo/image-classification/img_classifier.py @@ -10,13 +10,13 @@ import numpy as np # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=1, help="Number of epoches for fine-tuning.") -parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for fine-tuning.") parser.add_argument("--checkpoint_dir", type=str, default="paddlehub_finetune_ckpt", help="Path to save log data.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--module", type=str, default="resnet50", help="Module used as feature extractor.") parser.add_argument("--dataset", type=str, default="flowers", help="Dataset to finetune.") -parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") -parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.") # yapf: enable. module_map = { diff --git a/demo/reading-comprehension/evaluate_v1.py b/demo/reading-comprehension/evaluate_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..120aceaee9e48cc42767dcadb7c47fae0f8e7caa --- /dev/null +++ b/demo/reading-comprehension/evaluate_v1.py @@ -0,0 +1,98 @@ +""" Official evaluation script for v1.1 of the SQuAD dataset. 
""" +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths(f1_score, prediction, + ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {'exact_match': exact_match, 'f1': f1} + + +if __name__ == '__main__': + expected_version = '1.1' + parser = argparse.ArgumentParser( + description='Evaluation for SQuAD ' + expected_version) + parser.add_argument('dataset_file', help='Dataset file') + parser.add_argument('prediction_file', help='Prediction File') + args = parser.parse_args() + with open(args.dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print( + 'Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + + print(args.prediction_file) + with open(args.prediction_file) as prediction_file: + predictions = json.load(prediction_file) + print(json.dumps(evaluate(dataset, predictions))) diff --git a/demo/reading-comprehension/evaluate_v2.py b/demo/reading-comprehension/evaluate_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..8a82e021244929ef80a4c3d3b32dd6820526c76d --- /dev/null +++ b/demo/reading-comprehension/evaluate_v2.py @@ -0,0 +1,163 @@ +"""Official evaluation script for SQuAD version 2.0. + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. 
+This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. +""" +import argparse +import collections +import json +import numpy as np +import os +import re +import string +import sys + + +def make_qid_to_has_ans(dataset): + qid_to_has_ans = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) + return qid_to_has_ans + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(dataset, preds): + exact_scores = {} + f1_scores = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid = qa['id'] + gold_answers = [ + a['text'] for a in qa['answers'] + if normalize_answer(a['text']) + ] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + if qid not in preds: + print('Missing prediction for %s' % qid) + continue + a_pred = preds[qid] + # Take max over all gold answers + exact_scores[qid] = max( + compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max( + compute_f1(a, a_pred) for a in gold_answers) + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, + qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, + qid_to_has_ans) + best_f1, f1_thresh = 
find_best_thresh(preds, f1_raw, na_probs,
+                                           qid_to_has_ans)
+    main_eval['best_exact'] = best_exact
+    main_eval['best_exact_thresh'] = exact_thresh
+    main_eval['best_f1'] = best_f1
+    main_eval['best_f1_thresh'] = f1_thresh
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for i, qid in enumerate(qid_list):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            if preds[qid]:
+                diff = -1
+            else:
+                diff = 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+    return 100.0 * best_score / len(scores), best_thresh
diff --git a/demo/reading-comprehension/predict.py b/demo/reading-comprehension/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0896e4d8bafea2ded539b5b68c9cc9895579bc4
--- /dev/null
+++ b/demo/reading-comprehension/predict.py
@@ -0,0 +1,525 @@
+#coding:utf-8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prediction and evaluation on a reading comprehension task."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import ast
+import collections
+import json
+import io
+import math
+import numpy as np
+import os
+import six
+import sys
+import time
+
+import paddle
+import paddle.fluid as fluid
+import paddlehub as hub
+import evaluate_v1
+import evaluate_v2
+
+hub.common.logger.logger.setLevel("INFO")
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--num_epoch", type=int, default=1, help="Number of epochs for fine-tuning.")
+parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether to use GPU for fine-tuning; input should be True or False.")
+parser.add_argument("--learning_rate", type=float, default=4e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
+parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion param for the warmup strategy.")
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint.")
+parser.add_argument("--result_dir", type=str, default=None, help="Directory where predicted results are written.")
+parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest sequence.")
+parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
+parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether to use pyreader to feed data.")
+parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether to use data parallel.")
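+# Note: the SQuAD-specific decoding options below appear to mirror the defaults
+# of the original BERT run_squad example (n_best_size=20, max_answer_length=30,
+# null_score_diff_threshold=0.0).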
+parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.")
+parser.add_argument("--n_best_size", type=int, default=20, help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold, predict null.")
+parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using SQuAD v2.0, it should be set True.")
+args = parser.parse_args()
+# yapf: enable.
+
+
+def write_predictions(
+        all_examples,
+        all_features,
+        all_results,
+        output_prediction_file,
+        output_nbest_file,
+        output_null_log_odds_file,
+        n_best_size=20,
+        max_answer_length=30,
+        do_lower_case=True,
+        version_2_with_negative=False,
+        null_score_diff_threshold=0.0,
+):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    print("Writing predictions to: %s" % (output_prediction_file))
+    print("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", [
+            "feature_index", "start_index", "end_index", "start_logit",
+            "end_logit"
+        ])
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[
+                    0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
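+                    # The checks below discard candidates whose start or end
+                    # falls outside the passage tokens, reversed spans, and
+                    # spans longer than max_answer_length.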
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+
+        if version_2_with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit))
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index:(
+                    pred.end_index + 1)]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(
+                    orig_doc_end + 1)]
+                tok_text = " ".join(tok_tokens)
+
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = get_final_text(tok_text, orig_text, do_lower_case)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+
+        # if we didn't include the empty option in the n-best, include it
+        if version_2_with_negative:
+            if "" not in seen_predictions:
+                nbest.append(
+                    _NbestPrediction(
+                        text="",
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
+            # In very rare edge cases we could have no valid predictions. So we
+            # just create a nonce prediction in this case to avoid failure.
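+        # "empty" below is only a placeholder answer string; it simply scores
+        # zero against any real gold answer.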
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        assert len(nbest) >= 1
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+        # debug
+        if best_non_null_entry is None:
+            print("Warning: no non-null best entry was found for this example.")
+
+        probs = compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+
+        if not version_2_with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (
+                best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def get_final_text(pred_text, orig_text, do_lower_case):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
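+    # If any step of the alignment below fails, we conservatively fall back to
+    # returning orig_text unchanged.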
+ tokenizer = hub.reader.tokenization.BasicTokenizer( + do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted( + enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +if __name__ == '__main__': + # Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + # module = hub.Module(module_dir=["./bert_uncased_L-12_H-768_A-12.hub_module"]) + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use ReadingComprehensionReader to read dataset + dataset = hub.dataset.SQUAD( + version_2_with_negative=args.version_2_with_negative) + + reader = hub.reader.ReadingComprehensionReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_length=args.max_seq_len, + doc_stride=128, + max_query_length=64) + + # Use "sequence_output" for token-level output. 
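+    # Span extraction needs per-token start/end logits, so the token-level
+    # sequence_output is used here rather than pooled_output.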
+ seq_output = outputs["sequence_output"] + + # Setup feed list for data feeder + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Select finetune strategy, setup config and finetune + strategy = hub.AdamWeightDecayStrategy( + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + warmup_proportion=args.warmup_proportion, + lr_scheduler="linear_decay") + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + log_interval=10, + use_pyreader=args.use_pyreader, + use_data_parallel=args.use_data_parallel, + save_ckpt_interval=100, + use_cuda=args.use_gpu, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + enable_memory_optim=True, + strategy=strategy) + + # Define a reading comprehension finetune task by PaddleHub's API + reading_comprehension_task = hub.ReadingComprehensionTask( + data_reader=reader, + feature=seq_output, + feed_list=feed_list, + config=config) + + # Data to be predicted + data = dataset.predict_examples + + features = reader.convert_examples_to_features( + examples=data, is_training=False) + run_states = reading_comprehension_task.predict(data=data) + results = [run_state.run_results for run_state in run_states] + + RawResult = collections.namedtuple( + "RawResult", ["unique_id", "start_logits", "end_logits"]) + all_results = [] + for batch_idx, batch_result in enumerate(results): + np_unique_ids = batch_result[0] + np_start_logits = batch_result[1] + np_end_logits = batch_result[2] + np_num_seqs = batch_result[3] + + for idx in range(np_unique_ids.shape[0]): + unique_id = int(np_unique_ids[idx]) + start_logits = [float(x) for x in np_start_logits[idx].flat] + end_logits = [float(x) for x in np_end_logits[idx].flat] + all_results.append( + RawResult( + unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + output_prediction_file = os.path.join(args.result_dir, "predictions.json") + output_nbest_file = os.path.join(args.result_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(args.result_dir, "null_odds.json") + + write_predictions( + data, + features, + all_results, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + max_answer_length=args.max_answer_length, + n_best_size=args.n_best_size, + version_2_with_negative=args.version_2_with_negative, + null_score_diff_threshold=args.null_score_diff_threshold) + + with io.open(dataset.predict_file, 'r', encoding="utf8") as dataset_file: + dataset_json = json.load(dataset_file) + dataset = dataset_json['data'] + + with io.open( + output_prediction_file, 'r', encoding="utf8") as prediction_file: + predictions = json.load(prediction_file) + + if not args.version_2_with_negative: + print(json.dumps(evaluate_v1.evaluate(dataset, predictions))) + else: + with io.open( + output_null_log_odds_file, 'r', encoding="utf8") as odds_file: + na_probs = json.load(odds_file) + # Maps qid to true/false + qid_to_has_ans = evaluate_v2.make_qid_to_has_ans(dataset) + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = evaluate_v2.get_raw_scores(dataset, predictions) + exact_thresh = evaluate_v2.apply_no_ans_threshold( + exact_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0) + f1_thresh = evaluate_v2.apply_no_ans_threshold( + f1_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0) + out_eval = 
evaluate_v2.make_eval_dict(exact_thresh, f1_thresh)
+
+        if has_ans_qids:
+            has_ans_eval = evaluate_v2.make_eval_dict(
+                exact_thresh, f1_thresh, qid_list=has_ans_qids)
+            evaluate_v2.merge_eval(out_eval, has_ans_eval, 'HasAns')
+        if no_ans_qids:
+            no_ans_eval = evaluate_v2.make_eval_dict(
+                exact_thresh, f1_thresh, qid_list=no_ans_qids)
+            evaluate_v2.merge_eval(out_eval, no_ans_eval, 'NoAns')
+
+        evaluate_v2.find_all_best_thresh(out_eval, predictions, exact_raw,
+                                         f1_raw, na_probs, qid_to_has_ans)
+        print(json.dumps(out_eval, indent=4))
diff --git a/demo/reading-comprehension/reading_comprehension.py b/demo/reading-comprehension/reading_comprehension.py
new file mode 100644
index 0000000000000000000000000000000000000000..1feb19e16933e816cfbbb01f0bbb7e4e998351b1
--- /dev/null
+++ b/demo/reading-comprehension/reading_comprehension.py
@@ -0,0 +1,97 @@
+#coding:utf-8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning on a reading comprehension task."""
+
+import argparse
+import ast
+
+import paddle.fluid as fluid
+import paddlehub as hub
+
+hub.common.logger.logger.setLevel("INFO")
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--num_epoch", type=int, default=1, help="Number of epochs for fine-tuning.")
+parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether to use GPU for fine-tuning; input should be True or False.")
+parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
+parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion param for the warmup strategy.")
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
+parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest sequence.")
+parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
+parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether to use pyreader to feed data.")
+parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether to use data parallel.")
+parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using SQuAD v2.0, it should be set True.")
+args = parser.parse_args()
+# yapf: enable.
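+# See run_finetune.sh in this directory for a recommended set of flags.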
+ +if __name__ == '__main__': + # Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use ReadingComprehensionReader to read dataset + dataset = hub.dataset.SQUAD( + version_2_with_negative=args.version_2_with_negative) + + reader = hub.reader.ReadingComprehensionReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_length=args.max_seq_len, + doc_stride=128, + max_query_length=64) + + seq_output = outputs["sequence_output"] + + # Setup feed list for data feeder + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Select finetune strategy, setup config and finetune + strategy = hub.AdamWeightDecayStrategy( + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + warmup_proportion=args.warmup_proportion, + lr_scheduler="linear_decay") + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + log_interval=10, + use_pyreader=args.use_pyreader, + use_data_parallel=args.use_data_parallel, + save_ckpt_interval=1000, + use_cuda=args.use_gpu, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + enable_memory_optim=True, + strategy=strategy) + + # Define a reading comprehension finetune task by PaddleHub's API + reading_comprehension_task = hub.ReadingComprehensionTask( + data_reader=reader, + feature=seq_output, + feed_list=feed_list, + config=config) + + # Finetune by PaddleHub's API + reading_comprehension_task.finetune() diff --git a/demo/reading-comprehension/run_finetune.sh b/demo/reading-comprehension/run_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d17ce9f04c0de49174057cbfce022e0d51d5c92 --- /dev/null +++ b/demo/reading-comprehension/run_finetune.sh @@ -0,0 +1,15 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0,1 + +python -u reading_comprehension.py \ + --batch_size=12 \ + --use_gpu=True \ + --checkpoint_dir="./ckpt_rc" \ + --learning_rate=3e-5 \ + --weight_decay=0.01 \ + --warmup_proportion=0.1 \ + --num_epoch=2 \ + --max_seq_len=384 \ + --use_pyreader=True \ + --use_data_parallel=True \ + --version_2_with_negative=False diff --git a/demo/reading-comprehension/run_predict.sh b/demo/reading-comprehension/run_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..87ff3a3cc91593b8763fa023d18b1d882036c585 --- /dev/null +++ b/demo/reading-comprehension/run_predict.sh @@ -0,0 +1,21 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +CKPT_DIR="./ckpt_rc" +RES_DIR="./result" + +mkdir $RES_DIR + +python -u predict.py \ + --batch_size=12 \ + --use_gpu=True \ + --checkpoint_dir=${CKPT_DIR} \ + --learning_rate=3e-5 \ + --weight_decay=0.01 \ + --warmup_proportion=0.1 \ + --num_epoch=1 \ + --max_seq_len=384 \ + --use_pyreader=False \ + --use_data_parallel=False \ + --version_2_with_negative=False \ + --result_dir=${RES_DIR} diff --git a/demo/regression/predict.py b/demo/regression/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..4f0fb9590c326af7ae71054a5e371e42427802b6 --- /dev/null +++ b/demo/regression/predict.py @@ -0,0 +1,113 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prediction on a regression task."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import ast
+import numpy as np
+import os
+import time
+
+import paddle
+import paddle.fluid as fluid
+import paddlehub as hub
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
+parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest sequence.")
+parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether to use GPU for fine-tuning; input should be True or False.")
+parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether to use pyreader to feed data.")
+parser.add_argument("--dataset", type=str, default="STS-B", help="The dataset to be predicted.")
+args = parser.parse_args()
+# yapf: enable.
+
+if __name__ == '__main__':
+    dataset = None
+    metrics_choices = []
+    # Download dataset and use RegressionReader to read it
+    if args.dataset.lower() == "sts-b":
+        dataset = hub.dataset.GLUE("STS-B")
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
+    else:
+        raise ValueError("%s dataset is not defined" % args.dataset)
+
+    support_metrics = ["acc", "f1", "matthews"]
+    for metric in metrics_choices:
+        if metric not in support_metrics:
+            raise ValueError("\"%s\" metric is not defined" % metric)
+
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)
+    reader = hub.reader.RegressionReader(
+        dataset=dataset,
+        vocab_path=module.get_vocab_path(),
+        max_seq_len=args.max_seq_len)
+
+    # Construct transfer learning network
+    # Use "pooled_output" for classification tasks on an entire sentence.
+    # Use "sequence_output" for token-level output.
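+    # STS-B predicts a single similarity score per sentence pair, so the
+    # sentence-level pooled_output feeds the regression head.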
+ pooled_output = outputs["pooled_output"] + + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=False, + use_pyreader=args.use_pyreader, + use_cuda=args.use_gpu, + batch_size=args.batch_size, + enable_memory_optim=False, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) + + # Define a regression finetune task by PaddleHub's API + reg_task = hub.RegressionTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + config=config) + + # Data to be prdicted + data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()] + + index = 0 + run_states = reg_task.predict(data=data) + results = [run_state.run_results for run_state in run_states] + if not os.path.exists("output"): + os.makedirs("output") + fout = open(os.path.join("output", "%s.tsv" % args.dataset.upper()), 'w') + fout.write("index\tprediction") + for batch_result in results: + for result in batch_result[0]: + if index < 3: + print("%s\t%s\tpredict=%.3f" % (data[index][0], data[index][1], + result[0])) + fout.write("\n%s\t%.3f" % (index, result[0])) + index += 1 + fout.close() diff --git a/demo/regression/regression.py b/demo/regression/regression.py new file mode 100644 index 0000000000000000000000000000000000000000..1e6d7372d10b78c3998a2ec3cb1ad28d1c6806fb --- /dev/null +++ b/demo/regression/regression.py @@ -0,0 +1,95 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Finetuning on classification task """ + +import argparse +import ast + +import paddle.fluid as fluid +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--dataset", type=str, default="STS-B", help="Directory to model checkpoint") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") +parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") +parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + dataset = None + # Download dataset and use ClassifyReader to read dataset + if args.dataset.lower() == "sts-b": + dataset = hub.dataset.GLUE("STS-B") + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + else: + raise ValueError("%s dataset is not defined" % args.dataset) + + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + reader = hub.reader.RegressionReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len) + + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. 
+ pooled_output = outputs["pooled_output"] + + # Setup feed list for data feeder + # Must feed all the tensor of ERNIE's module need + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Select finetune strategy, setup config and finetune + strategy = hub.AdamWeightDecayStrategy( + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + lr_scheduler="linear_decay") + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_pyreader=args.use_pyreader, + use_cuda=args.use_gpu, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=strategy) + + # Define a regression finetune task by PaddleHub's API + reg_task = hub.RegressionTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + config=config) + + # Finetune and evaluate by PaddleHub's API + # will finish training, evaluation, testing, save model automatically + reg_task.finetune_and_eval() diff --git a/demo/regression/run_predict.sh b/demo/regression/run_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d0c1ae007ea12803278d29b4a94e15ad757f65b --- /dev/null +++ b/demo/regression/run_predict.sh @@ -0,0 +1,13 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +# export CUDA_VISIBLE_DEVICES=0 + +# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task +DATASET="STS-B" +CKPT_DIR="./ckpt_${DATASET}" +# STS-B: batch_size=32, max_seq_len=128 + +python -u predict.py --checkpoint_dir $CKPT_DIR \ + --max_seq_len 128 \ + --use_gpu True \ + --dataset=${DATASET} \ + --batch_size=32 \ diff --git a/demo/regression/run_regssion.sh b/demo/regression/run_regssion.sh new file mode 100644 index 0000000000000000000000000000000000000000..c1ed3c1f44cff44677a485d4b45e8e08ba01d262 --- /dev/null +++ b/demo/regression/run_regssion.sh @@ -0,0 +1,19 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +DATASET="STS-B" +CKPT_DIR="./ckpt_${DATASET}" +# Recommending hyper parameters for difference task +# STS-B: batch_size=32, weight_decay=0.1, num_epoch=3, max_seq_len=128, lr=4e-5 + +python -u regression.py \ + --batch_size=32 \ + --use_gpu=True \ + --dataset=${DATASET} \ + --checkpoint_dir=${CKPT_DIR} \ + --learning_rate=4e-5 \ + --weight_decay=0.1 \ + --max_seq_len=128 \ + --num_epoch=3 \ + --use_pyreader=True \ + --use_data_parallel=True \ diff --git a/demo/text-classification/predict.py b/demo/text-classification/predict.py index bc54aff9b9245d9c96bcf027a5c94e00b4fb65ae..3e11f9c3bd97cf53dc4d8413b9e2fa6cad0db6e1 100644 --- a/demo/text-classification/predict.py +++ b/demo/text-classification/predict.py @@ -29,7 +29,7 @@ import paddlehub as hub # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--checkpoint_dir", type=str, default="ckpt_20190802182531", help="Directory to model checkpoint") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") @@ -42,64 +42,89 @@ args = parser.parse_args() if __name__ == '__main__': dataset = 
None + metrics_choices = [] # Download dataset and use ClassifyReader to read dataset if args.dataset.lower() == "chnsenticorp": dataset = hub.dataset.ChnSentiCorp() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "nlpcc_dbqa": dataset = hub.dataset.NLPCC_DBQA() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "lcqmc": dataset = hub.dataset.LCQMC() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "mrpc": dataset = hub.dataset.GLUE("MRPC") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["f1", "acc"] + # The first metric will be choose to eval. Ref: task.py:799 elif args.dataset.lower() == "qqp": dataset = hub.dataset.GLUE("QQP") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["f1", "acc"] elif args.dataset.lower() == "sst-2": dataset = hub.dataset.GLUE("SST-2") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["acc"] elif args.dataset.lower() == "cola": dataset = hub.dataset.GLUE("CoLA") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["matthews", "acc"] elif args.dataset.lower() == "qnli": dataset = hub.dataset.GLUE("QNLI") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["acc"] elif args.dataset.lower() == "rte": dataset = hub.dataset.GLUE("RTE") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") - elif args.dataset.lower() == "mnli": - dataset = hub.dataset.GLUE("MNLI") + metrics_choices = ["acc"] + elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli_m": + dataset = hub.dataset.GLUE("MNLI_m") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["acc"] + elif args.dataset.lower() == "mnli_mm": + dataset = hub.dataset.GLUE("MNLI_mm") + if args.use_taskid: + module = hub.Module(name="ernie_v2_eng_base") + else: + module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["acc"] elif args.dataset.lower().startswith("xnli"): dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:]) module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12") + metrics_choices = ["acc"] else: raise ValueError("%s dataset is not defined" % args.dataset) + support_metrics = ["acc", "f1", "matthews"] + for metric in metrics_choices: + if metric not in support_metrics: + raise ValueError("\"%s\" metric is not defined" % metric) + inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) reader = hub.reader.ClassifyReader( @@ -147,7 +172,8 @@ if __name__ == '__main__': feature=pooled_output, feed_list=feed_list, num_classes=dataset.num_labels, - config=config) + config=config, + metrics_choices=metrics_choices) # Data to be prdicted data = [[d.text_a, d.text_b] for d in dataset.get_dev_examples()[:3]] diff --git a/demo/text-classification/run_classifier.sh b/demo/text-classification/run_classifier.sh index 
8016b2a3de342894e36ccbef4d7bb621fecb863a..d1aec212f97f2bbcc22b8f24a16bdce321099e49 100644
--- a/demo/text-classification/run_classifier.sh
+++ b/demo/text-classification/run_classifier.sh
@@ -2,18 +2,32 @@ export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
-
DATASET="chnsenticorp"
CKPT_DIR="./ckpt_${DATASET}"
-# Support ChnSentiCorp NLPCC_DBQA LCQMC MRPC QQP SST-2
-#         CoLA QNLI RTE MNLI XNLI
-# for XNLI: Specify the language with an underscore like xnli_zh.
-#           ar: Arabic      bg: Bulgarian   de: German
-#           el: Greek       en: English     es: Spanish
-#           fr: French      hi: Hindi       ru: Russian
-#           sw: Swahili     th: Thai        tr: Turkish
-#           ur: Urdu        vi: Vietnamese  zh: Chinese (Simplified)
+# Recommended hyperparameters for different tasks
+# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
+# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5
+# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5
+# QQP: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
+# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
+# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
+# CoLA: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
+# MRPC: batch_size=32, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
+# RTE: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=3e-5
+# MNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
+#       Specify the matched/mismatched dev and test dataset with an underscore.
+#       mnli_m or mnli: dev and test on the matched dataset.
+#       mnli_mm: dev and test on the mismatched dataset.
+#       The difference is described in https://www.nyu.edu/projects/bowman/multinli/paper.pdf.
+#       If you are not sure which one to pick, just use mnli or mnli_m.
+# XNLI: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
+#       Specify the language with an underscore like xnli_zh.
+#       ar- Arabic      bg- Bulgarian   de- German
+#       el- Greek       en- English     es- Spanish
+#       fr- French      hi- Hindi       ru- Russian
+#       sw- Swahili     th- Thai        tr- Turkish
+#       ur- Urdu        vi- Vietnamese  zh- Chinese (Simplified)

python -u text_classifier.py \
    --batch_size=24 \
diff --git a/demo/text-classification/run_predict.sh b/demo/text-classification/run_predict.sh
index c7f3c8ddcad88a643d87f03944d9ef453968a59b..9c17b4e1151419863988e5bbcf0e2cfb695d269f 100644
--- a/demo/text-classification/run_predict.sh
+++ b/demo/text-classification/run_predict.sh
@@ -2,17 +2,20 @@ export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

# User can select chnsenticorp, nlpcc_dbqa, lcqmc and so on for different task
-
-DATASET="chnsenticorp"
-CKPT_DIR="./ckpt_${DATASET}"
-
# Support ChnSentiCorp NLPCC_DBQA LCQMC MRPC QQP SST-2
-#         CoLA QNLI RTE MNLI XNLI
+#         CoLA QNLI RTE MNLI (or MNLI_m) MNLI_mm XNLI
# for XNLI: Specify the language with an underscore like xnli_zh.
# ar: Arabic bg: Bulgarian de: German # el: Greek en: English es: Spanish # fr: French hi: Hindi ru: Russian # sw: Swahili th: Thai tr: Turkish # ur: Urdu vi: Vietnamese zh: Chinese (Simplified) +DATASET="ChnSentiCorp" +CKPT_DIR="./ckpt_${DATASET}" -python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu False --dataset=${DATASET} --use_taskid False +python -u predict.py --checkpoint_dir=$CKPT_DIR \ + --max_seq_len=128 \ + --use_gpu=True \ + --dataset=${DATASET} \ + --batch_size=150 \ + --use_taskid=False \ diff --git a/demo/text-classification/text_classifier.py b/demo/text-classification/text_classifier.py index cb0b3d3612fc57bd8c37257508cd0820438b4d83..1bf1431e78ebf41be9f8447d799463202e811367 100644 --- a/demo/text-classification/text_classifier.py +++ b/demo/text-classification/text_classifier.py @@ -26,7 +26,7 @@ parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whet parser.add_argument("--dataset", type=str, default="chnsenticorp", help="The choice of dataset") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") -parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") +parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy") parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") @@ -39,64 +39,89 @@ args = parser.parse_args() if __name__ == '__main__': dataset = None + metrics_choices = [] # Download dataset and use ClassifyReader to read dataset if args.dataset.lower() == "chnsenticorp": dataset = hub.dataset.ChnSentiCorp() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "nlpcc_dbqa": dataset = hub.dataset.NLPCC_DBQA() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "lcqmc": dataset = hub.dataset.LCQMC() module = hub.Module(name="ernie") + metrics_choices = ["acc"] elif args.dataset.lower() == "mrpc": dataset = hub.dataset.GLUE("MRPC") if args.use_taskid: module = hub.Module(name="ernie_v2_eng_base") else: module = hub.Module(name="bert_uncased_L-12_H-768_A-12") + metrics_choices = ["f1", "acc"] + # The first metric will be choose to eval. 
Ref: task.py:799
    elif args.dataset.lower() == "qqp":
        dataset = hub.dataset.GLUE("QQP")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["f1", "acc"]
    elif args.dataset.lower() == "sst-2":
        dataset = hub.dataset.GLUE("SST-2")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
    elif args.dataset.lower() == "cola":
        dataset = hub.dataset.GLUE("CoLA")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["matthews", "acc"]
    elif args.dataset.lower() == "qnli":
        dataset = hub.dataset.GLUE("QNLI")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
    elif args.dataset.lower() == "rte":
        dataset = hub.dataset.GLUE("RTE")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    elif args.dataset.lower() == "mnli":
-        dataset = hub.dataset.GLUE("MNLI")
+        metrics_choices = ["acc"]
+    elif args.dataset.lower() == "mnli" or args.dataset.lower() == "mnli_m":
+        dataset = hub.dataset.GLUE("MNLI_m")
        if args.use_taskid:
            module = hub.Module(name="ernie_v2_eng_base")
        else:
            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
+    elif args.dataset.lower() == "mnli_mm":
+        dataset = hub.dataset.GLUE("MNLI_mm")
+        if args.use_taskid:
+            module = hub.Module(name="ernie_v2_eng_base")
+        else:
+            module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
    elif args.dataset.lower().startswith("xnli"):
        dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:])
        module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
+        metrics_choices = ["acc"]
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

+    support_metrics = ["acc", "f1", "matthews"]
+    for metric in metrics_choices:
+        if metric not in support_metrics:
+            raise ValueError("\"%s\" metric is not defined" % metric)
+
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)
    reader = hub.reader.ClassifyReader(
@@ -144,7 +169,8 @@ if __name__ == '__main__':
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
-        config=config)
+        config=config,
+        metrics_choices=metrics_choices)

    # Finetune and evaluate by PaddleHub's API
    # will finish training, evaluation, testing, save model automatically
diff --git a/paddlehub/__init__.py b/paddlehub/__init__.py
index 9824616f760cdbce200129bddd651da22dbbf5bd..9a95bc4937b846ed187ee6c9416d824debcfff1a 100644
--- a/paddlehub/__init__.py
+++ b/paddlehub/__init__.py
@@ -50,7 +50,12 @@ from .finetune.task import TextClassifierTask
from .finetune.task import ImageClassifierTask
from .finetune.task import SequenceLabelTask
from .finetune.task import MultiLabelClassifierTask
+from .finetune.task import RegressionTask
+from .finetune.task import ReadingComprehensionTask
from .finetune.config import RunConfig
from .finetune.strategy import AdamWeightDecayStrategy
from .finetune.strategy import DefaultStrategy
from .finetune.strategy import DefaultFinetuneStrategy
+from .finetune.strategy import L2SPFinetuneStrategy
+from .finetune.strategy import ULMFiTStrategy
+from .finetune.strategy import CombinedStrategy
diff --git a/paddlehub/autofinetune/__init__.py
b/paddlehub/autofinetune/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..be11b01db15040e0997426a9485f2a07d2bf85cf --- /dev/null +++ b/paddlehub/autofinetune/__init__.py @@ -0,0 +1,14 @@ +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlehub/autofinetune/autoft.py b/paddlehub/autofinetune/autoft.py new file mode 100644 index 0000000000000000000000000000000000000000..99c3b3f23b60cb469f969fa11922a6dfbcf70c6c --- /dev/null +++ b/paddlehub/autofinetune/autoft.py @@ -0,0 +1,211 @@ +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from multiprocessing.pool import ThreadPool +import copy +import json +import math +import numpy as np +import six +import time + +from paddlehub.common.logger import logger +from paddlehub.common.utils import mkdir + +if six.PY3: + INF = math.inf +else: + INF = float("inf") + + +class PSHE2(object): + def __init__( + self, + evaluator, + cudas=["0"], + popsize=5, + output_dir=None, + alpha=0.5, + epsilon=0.2, + ): + + self._num_thread = len(cudas) + self._popsize = popsize + self._alpha = alpha + self._epsilon = epsilon + self._iteration = 0 + self.cudas = cudas + self.is_cuda_free = {"free": [], "busy": []} + self.is_cuda_free["free"] = cudas + + self.evaluator = evaluator + self.init_input = evaluator.get_init_params() + self.num_hparm = len(self.init_input) + + self.best_hparams_per_pop = [[0] * self.num_hparm] * self._popsize + self.best_reward_per_pop = [INF] * self._popsize + self.momentums = [[0] * self.num_hparm] * self._popsize + self.best_hparms_all_pop = [] + self.best_reward_all_pop = INF + self.current_hparams = [[0] * self.num_hparm] * self._popsize + for i in range(self.popsize): + self.current_hparams[i] = self.randomSolution() + + if output_dir is None: + now = int(time.time()) + time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now)) + self._output_dir = "output_" + time_str + else: + self._output_dir = output_dir + + @property + def thread(self): + return self._num_thread + + @property + def popsize(self): + return self._popsize + + @property + def alpha(self): + return self._alpha + + @property + def epsilon(self): + return self._epsilon + + @property + def output_dir(self): + return self._output_dir + + @property + def iteration(self): + return self._iteration + + def set_output_dir(self, output_dir=None): + if output_dir is not None: + output_dir 
= output_dir + else: + output_dir = self._output_dir + return output_dir + + def randomSolution(self): + solut = [0] * self.num_hparm + for i in range(self.num_hparm): + ratio = (np.random.random_sample() - 0.5) * 2.0 + if ratio >= 0: + solut[i] = ( + 1.0 - self.init_input[i]) * ratio + self.init_input[i] + else: + solut[i] = ( + self.init_input[i] + 1.0) * ratio + self.init_input[i] + return solut + + def smallPeturb(self): + for i in range(self.popsize): + for j in range(self.num_hparm): + ratio = (np.random.random_sample() - 0.5) * 2.0 + if ratio >= 0: + self.current_hparams[i][j] = ( + 1.0 - self.current_hparams[i][j] + ) * ratio * self.epsilon + self.current_hparams[i][j] + else: + self.current_hparams[i][j] = ( + self.current_hparams[i][j] + + 1.0) * ratio * self.epsilon + self.current_hparams[i][j] + + def estimatePopGradients(self): + gradients = [[0] * self.num_hparm] * self.popsize + for i in range(self.popsize): + for j in range(self.num_hparm): + gradients[i][j] = self.current_hparams[i][ + j] - self.best_hparms_all_pop[j] + return gradients + + def estimateLocalGradients(self): + gradients = [[0] * self.num_hparm] * self.popsize + for i in range(self.popsize): + for j in range(self.num_hparm): + gradients[i][j] = self.current_hparams[i][ + j] - self.best_hparams_per_pop[i][j] + return gradients + + def estimateMomemtum(self): + popGrads = self.estimatePopGradients() + localGrads = self.estimateLocalGradients() + for i in range(self.popsize): + for j in range(self.num_hparm): + self.momentums[i][j] = ( + 1 - 3.0 * self.alpha / self.iteration + ) * self.momentums[i][j] - self.alpha * localGrads[i][ + j] - self.alpha * popGrads[i][j] + + def is_stop(self): + return False + + def solutions(self): + return self.current_hparams + + def feedback(self, params_list, reward_list): + self._iteration = self._iteration + 1 + for i in range(self.popsize): + if reward_list[i] < self.best_reward_per_pop[i]: + self.best_hparams_per_pop[i] = copy.deepcopy( + self.current_hparams[i]) + self.best_reward_per_pop[i] = reward_list[i] + if reward_list[i] < self.best_reward_all_pop: + self.best_hparms_all_pop = self.current_hparams[i] + self.best_reward_all_pop = reward_list[i] + self.estimateMomemtum() + for i in range(self.popsize): + for j in range(len(self.init_input)): + self.current_hparams[i][j] = self.current_hparams[i][ + j] + self.alpha * self.momentums[i][j] + self.smallPeturb() + + def optimal_solution(self): + return self.best_hparms_all_pop + + def step(self, output_dir): + solutions = self.solutions() + + params_cudas_dirs = [] + solution_results = [] + cnt = 0 + solutions_ckptdirs = {} + mkdir(output_dir) + for idx, solution in enumerate(solutions): + cuda = self.is_cuda_free["free"][0] + ckptdir = output_dir + "/ckpt-" + str(idx) + log_file = output_dir + "/log-" + str(idx) + ".info" + params_cudas_dirs.append([solution, cuda, ckptdir, log_file]) + solutions_ckptdirs[tuple(solution)] = ckptdir + self.is_cuda_free["free"].remove(cuda) + self.is_cuda_free["busy"].append(cuda) + if len(params_cudas_dirs) == self.thread or cnt == int( + self.popsize / self.thread): + tp = ThreadPool(len(params_cudas_dirs)) + solution_results += tp.map(self.evaluator.run, + params_cudas_dirs) + cnt += 1 + tp.close() + tp.join() + for param_cuda in params_cudas_dirs: + self.is_cuda_free["free"].append(param_cuda[1]) + self.is_cuda_free["busy"].remove(param_cuda[1]) + params_cudas_dirs = [] + + self.feedback(solutions, solution_results) + + return solutions_ckptdirs diff --git 
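Before the evaluator code that follows, the PSHE2 update above is easier to follow in isolation. A minimal NumPy sketch of one particle's step, using the same formulas as estimateMomemtum and feedback (illustrative only; the full algorithm also re-perturbs each particle via smallPeturb afterwards):

```python
import numpy as np

# One PSHE2-style step for a single particle: the momentum combines two
# "gradients", the distance to the particle's own best solution and the
# distance to the population-wide best, then the particle moves along it.
def pshe2_step(current, local_best, pop_best, momentum, alpha, iteration):
    local_grad = current - local_best   # estimateLocalGradients
    pop_grad = current - pop_best       # estimatePopGradients
    momentum = ((1 - 3.0 * alpha / iteration) * momentum
                - alpha * local_grad - alpha * pop_grad)
    return current + alpha * momentum, momentum

x = np.array([0.2, -0.4])
m = np.zeros(2)
x, m = pshe2_step(x, np.array([0.1, -0.3]), np.array([0.0, 0.0]), m,
                  alpha=0.5, iteration=1)  # feedback() bumps iteration first
```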
a/paddlehub/autofinetune/evaluator.py b/paddlehub/autofinetune/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd4323307e2e54f37c4ec30d57dd24696a57762 --- /dev/null +++ b/paddlehub/autofinetune/evaluator.py @@ -0,0 +1,211 @@ +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import io +import hashlib +import math +import os +import random +import six +import yaml + +from paddlehub.common.logger import logger +from paddlehub.common.utils import is_windows + +REWARD_SUM = 10000 + +if six.PY3: + INF = math.inf +else: + INF = float("inf") + + +class BaseEvaluator(object): + def __init__(self, params_file, finetunee_script): + with io.open(params_file, 'r', encoding='utf8') as f: + self.params = yaml.safe_load(f) + self.finetunee_script = finetunee_script + + def get_init_params(self): + init_params = [] + for param in self.params["param_list"]: + init_params.append(param['init_value']) + init_params = self.inverse_convert_params(init_params) + return init_params + + def get_reward(self, result_output): + return REWARD_SUM - float(result_output) + + def is_valid_params(self, params): + for i in range(0, len(self.params["param_list"])): + if params[i] < float(self.params["param_list"][i]["greater_than"]): + return False + if params[i] > float(self.params["param_list"][i]["lower_than"]): + return False + return True + + def convert_params(self, params): + cparams = [] + for i in range(0, len(self.params["param_list"])): + cparams.append( + float(self.params["param_list"][i]["greater_than"] + + (params[i] + 1.0) / 2.0 * + (self.params["param_list"][i]["lower_than"] - + self.params["param_list"][i]["greater_than"]))) + if cparams[i] <= float( + self.params["param_list"][i]["greater_than"]): + cparams[i] = float(self.params["param_list"][i]["greater_than"]) + if cparams[i] >= float(self.params["param_list"][i]["lower_than"]): + cparams[i] = float(self.params["param_list"][i]["lower_than"]) + if self.params["param_list"][i]["type"] == "int": + cparams[i] = int(cparams[i]) + return cparams + + def inverse_convert_params(self, params): + cparams = [] + for i in range(0, len(self.params["param_list"])): + cparams.append( + float( + -1.0 + 2.0 * + (params[i] - self.params["param_list"][i]["greater_than"]) / + (self.params["param_list"][i]["lower_than"] - + self.params["param_list"][i]["greater_than"]))) + if cparams[i] <= -1.0: + cparams[i] = -1.0 + if cparams[i] >= 1.0: + cparams[i] = 1.0 + return cparams + + def format_params_str(self, params): + param_str = "--%s=%s" % (self.params["param_list"][0]["name"], + params[0]) + for i in range(1, len(self.params["param_list"])): + param_str = "%s --%s=%s" % ( + param_str, self.params["param_list"][i]["name"], str(params[i])) + return param_str + + def run(self, *args): + raise NotImplementedError + + def new_round(self): + pass + + +class 
FullTrailEvaluator(BaseEvaluator):
+    def __init__(self, params_file, finetunee_script):
+        super(FullTrailEvaluator, self).__init__(params_file, finetunee_script)
+        self.model_rewards = {}
+
+    def run(self, *args):
+        params = args[0][0]
+        num_cuda = args[0][1]
+        ckpt_dir = args[0][2]
+        log_file = args[0][3]
+        params = self.convert_params(params)
+        if not self.is_valid_params(params):
+            return REWARD_SUM
+
+        param_str = self.format_params_str(params)
+        f = open(log_file, "w")
+        f.close()
+
+        if is_windows():
+            run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
+        else:
+            run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
+
+        try:
+            os.system(run_cmd)
+            with open(log_file, "r") as f:
+                lines = f.readlines()
+                eval_result = lines[-1]
+        except:
+            print(
+                "WARNING: Program run with hyperparameters %s crashed!"
+                % param_str.replace("--", ""))
+            eval_result = 0.0
+        reward = self.get_reward(eval_result)
+        self.model_rewards[ckpt_dir] = reward
+        return reward
+
+
+class ModelBasedEvaluator(BaseEvaluator):
+    def __init__(self, params_file, finetunee_script):
+        super(ModelBasedEvaluator, self).__init__(params_file, finetunee_script)
+        self.model_rewards = {}
+        self.half_best_model_ckpt = []
+        self.run_count = 0
+
+    def run(self, *args):
+        params = args[0][0]
+        num_cuda = args[0][1]
+        ckpt_dir = args[0][2]
+        log_file = args[0][3]
+        params = self.convert_params(params)
+        if not self.is_valid_params(params):
+            return REWARD_SUM
+
+        param_str = self.format_params_str(params)
+        f = open(log_file, "w")
+        f.close()
+
+        if len(self.half_best_model_ckpt) > 0:
+            model_path = self.half_best_model_ckpt[self.run_count % len(
+                self.half_best_model_ckpt)] + "/best_model"
+            if is_windows():
+                run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --epochs=1 --model_path %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                    (num_cuda, self.finetunee_script, model_path, ckpt_dir, param_str, log_file)
+            else:
+                run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --epochs=1 --model_path %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                    (num_cuda, self.finetunee_script, model_path, ckpt_dir, param_str, log_file)
+        else:
+            if is_windows():
+                run_cmd = "set FLAGS_eager_delete_tensor_gb=0.0&set CUDA_VISIBLE_DEVICES=%s&python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                    (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
+            else:
+                run_cmd = "export FLAGS_eager_delete_tensor_gb=0.0; export CUDA_VISIBLE_DEVICES=%s; python -u %s --checkpoint_dir=%s %s >%s 2>&1" % \
+                    (num_cuda, self.finetunee_script, ckpt_dir, param_str, log_file)
+
+        self.run_count += 1
+        try:
+            os.system(run_cmd)
+            with open(log_file, "r") as f:
+                lines = f.readlines()
+                eval_result = lines[-1]
+        except:
+            print(
+                "WARNING: Program run with hyperparameters %s crashed!"
+ % param_str.replace("--", "")) + eval_result = 0.0 + reward = self.get_reward(eval_result) + self.model_rewards[ckpt_dir] = reward + return reward + + def new_round(self): + """update self.half_best_model""" + half_size = int(len(self.model_rewards) / 2) + if half_size < 1: + half_size = 1 + self.half_best_model_ckpt = list({ + key + for key in sorted( + self.model_rewards, key=self.model_rewards.get, reverse=False) + [:half_size] + }) + self.model_rewards = {} diff --git a/paddlehub/commands/__init__.py b/paddlehub/commands/__init__.py index b1159d54f4f1cddbc6da7546500690ac23921098..a8257a9466c9f6eeb95b847a62631173287e8527 100644 --- a/paddlehub/commands/__init__.py +++ b/paddlehub/commands/__init__.py @@ -25,3 +25,4 @@ from . import help from . import clear from . import config from . import hub +from . import autofinetune diff --git a/paddlehub/commands/autofinetune.py b/paddlehub/commands/autofinetune.py new file mode 100644 index 0000000000000000000000000000000000000000..5d352774218f664aae37e21bfaad7ec8b4d6e434 --- /dev/null +++ b/paddlehub/commands/autofinetune.py @@ -0,0 +1,167 @@ +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import io +import json +import os +import sys +import ast + +import six +import pandas +import numpy as np + +from paddlehub.commands.base_command import BaseCommand, ENTRY +from paddlehub.common.arg_helper import add_argument, print_arguments +from paddlehub.autofinetune.autoft import PSHE2 +from paddlehub.autofinetune.evaluator import FullTrailEvaluator +from paddlehub.autofinetune.evaluator import ModelBasedEvaluator +from paddlehub.common.logger import logger + +import paddlehub as hub + + +class AutoFineTuneCommand(BaseCommand): + name = "autofinetune" + + def __init__(self, name): + super(AutoFineTuneCommand, self).__init__(name) + self.show_in_help = True + self.name = name + self.description = "Paddlehub helps to finetune a task by searching hyperparameters automatically." + self.parser = argparse.ArgumentParser( + description=self.__class__.__doc__, + prog='%s %s ' % (ENTRY, + self.name), + usage='%(prog)s', + add_help=False) + self.module = None + + def add_params_file_arg(self): + self.arg_params_to_be_searched_group.add_argument( + "--param_file", + type=str, + default=None, + required=True, + help= + "Hyperparameters to be searched in the yaml format. The number of hyperparameters searched must be greater than 1." 
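The --param_file YAML just described declares one entry per hyperparameter with init_value, greater_than, lower_than and type fields; the naming is inverted relative to intuition (a value must be greater than "greater_than", the lower bound, and lower than "lower_than", the upper bound). PSHE2 searches in [-1, 1] and convert_params rescales into that range. A standalone sketch of the mapping, with field semantics taken from is_valid_params above:

```python
# Re-implementation of the evaluator's search-space rescaling (illustrative).
def to_user_space(x, lower_bound, upper_bound, as_int=False):
    v = lower_bound + (x + 1.0) / 2.0 * (upper_bound - lower_bound)
    v = min(max(v, lower_bound), upper_bound)  # clamp to the declared range
    return int(v) if as_int else v

to_user_space(0.0, 1e-5, 1e-3)           # midpoint of a learning-rate range: 5.05e-4
to_user_space(0.9, 8, 64, as_int=True)   # an int-typed parameter -> 61
```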
+        )
+
+    def add_autoft_config_arg(self):
+        self.arg_config_group.add_argument(
+            "--popsize", type=int, default=5, help="Population size")
+        self.arg_config_group.add_argument(
+            "--cuda",
+            type=ast.literal_eval,
+            default=['0'],
+            help="The list of gpu devices to be used")
+        self.arg_config_group.add_argument(
+            "--round", type=int, default=10, help="Number of searches")
+        self.arg_config_group.add_argument(
+            "--output_dir",
+            type=str,
+            default=None,
+            help="Directory to model checkpoint")
+        self.arg_config_group.add_argument(
+            "--evaluate_choice",
+            type=str,
+            default="fulltrail",
+            help="Choices: fulltrail or modelbased.")
+
+    def execute(self, argv):
+        if not argv:
+            print("ERROR: Please specify a script to be finetuned in Python.\n")
+            self.help()
+            return False
+
+        self.finetunee_script = argv[0]
+
+        self.parser.prog = '%s %s %s' % (ENTRY, self.name, self.finetunee_script)
+        self.arg_params_to_be_searched_group = self.parser.add_argument_group(
+            title="Input options",
+            description="Hyperparameters to be searched.")
+        self.arg_config_group = self.parser.add_argument_group(
+            title="Autofinetune config options",
+            description=
+            "Autofinetune configuration for controlling autofinetune behavior, not required"
+        )
+
+        self.add_params_file_arg()
+        self.add_autoft_config_arg()
+
+        if not argv[1:]:
+            self.help()
+            return False
+
+        self.args = self.parser.parse_args(argv[1:])
+        if self.args.evaluate_choice.lower() == "fulltrail":
+            evaluator = FullTrailEvaluator(self.args.param_file,
+                                           self.finetunee_script)
+        elif self.args.evaluate_choice.lower() == "modelbased":
+            evaluator = ModelBasedEvaluator(self.args.param_file,
+                                            self.finetunee_script)
+        else:
+            raise ValueError("The evaluate_choice %s is not defined!" %
+                             self.args.evaluate_choice)
+
+        autoft = PSHE2(
+            evaluator,
+            cudas=self.args.cuda,
+            popsize=self.args.popsize,
+            output_dir=self.args.output_dir)
+
+        run_round_cnt = 0
+        solutions_ckptdirs = {}
+        print("PaddleHub Autofinetune starts.")
+        while (not autoft.is_stop()) and run_round_cnt < self.args.round:
+            print("PaddleHub Autofinetune starts round %s." % run_round_cnt)
+            output_dir = autoft._output_dir + "/round" + str(run_round_cnt)
+            res = autoft.step(output_dir)
+            solutions_ckptdirs.update(res)
+            evaluator.new_round()
+            run_round_cnt = run_round_cnt + 1
+        print("PaddleHub Autofinetune ends.")
+        with open("./log_file.txt", "w") as f:
+            best_choice = evaluator.convert_params(autoft.optimal_solution())
+            print("The best hyperparameters:")
+            f.write("The best hyperparameters:\n")
+            param_name = []
+            for idx, param in enumerate(evaluator.params["param_list"]):
+                param_name.append(param["name"])
+                f.write(param["name"] + "\t:\t" + str(best_choice[idx]) + "\n")
+                print("%s : %s" % (param["name"], best_choice[idx]))
+            f.write("\n\n\n")
+            f.write("\t".join(param_name) + "\toutput_dir\n\n")
+
+            logger.info(
+                "The checkpoint directories of the runs over the searched hyperparameters are recorded in log_file.txt."
+            )
+            print(
+                "The checkpoint directories of the runs over the searched hyperparameters are recorded in log_file.txt."
+ ) + for solution, ckptdir in solutions_ckptdirs.items(): + param = evaluator.convert_params(solution) + param = [str(p) for p in param] + f.write("\t".join(param) + "\t" + ckptdir + "\n\n") + + return True + + +command = AutoFineTuneCommand.instance() diff --git a/paddlehub/commands/install.py b/paddlehub/commands/install.py index fc444b19ada6ab39d4c449b4a63535c972962a66..73d8b2ab1e4f2e8fc3735ff7719ac12a0c39cadb 100644 --- a/paddlehub/commands/install.py +++ b/paddlehub/commands/install.py @@ -36,7 +36,6 @@ class InstallCommand(BaseCommand): prog='%s %s ' % (ENTRY, name), usage='%(prog)s', add_help=False) - #TODO(wuzewu): add --upgrade option def execute(self, argv): if not argv: diff --git a/paddlehub/common/dir.py b/paddlehub/common/dir.py index bb665af64a1fab3697283449e514b8a792efe34b..552990b4d9d87ad7fa176d26c353c634d6a260b2 100644 --- a/paddlehub/common/dir.py +++ b/paddlehub/common/dir.py @@ -15,7 +15,6 @@ import os -# TODO: Change dir.py's filename, this naming rule is not qualified USER_HOME = os.path.expanduser('~') HUB_HOME = os.path.join(USER_HOME, ".paddlehub") MODULE_HOME = os.path.join(HUB_HOME, "modules") diff --git a/paddlehub/common/downloader.py b/paddlehub/common/downloader.py index 8aeb9781f91c42d46b20e2db20acf445d21b0658..28b42732c5d70133e503559306e466f5ff537d96 100644 --- a/paddlehub/common/downloader.py +++ b/paddlehub/common/downloader.py @@ -77,7 +77,6 @@ class Downloader(object): with open(file_name, 'wb') as f: shutil.copyfileobj(r.raw, f) else: - #TODO(ZeyuChen) upgrade to tqdm process with open(file_name, 'wb') as f: dl = 0 total_length = int(total_length) diff --git a/paddlehub/common/hub_server.py b/paddlehub/common/hub_server.py index 15b360fd6a30c0503e2c930a1c3bee0a35367b2b..8e1169470c1cdb6c91297eba187bddc4f8c33c9b 100644 --- a/paddlehub/common/hub_server.py +++ b/paddlehub/common/hub_server.py @@ -24,6 +24,7 @@ import requests import json import yaml import random +import fcntl from random import randint from paddlehub.common import utils, srv_utils @@ -38,6 +39,9 @@ CACHE_TIME = 60 * 10 class HubServer(object): def __init__(self, config_file_path=None): + LOCK_FILE = os.path.join(hub.HUB_HOME, '__LOCK__') + LOCK_FP = open(LOCK_FILE, 'a+') + fcntl.flock(LOCK_FP.fileno(), fcntl.LOCK_EX) if not config_file_path: config_file_path = os.path.join(hub.CONF_HOME, 'config.json') if not os.path.exists(hub.CONF_HOME): @@ -53,6 +57,7 @@ class HubServer(object): self.server_url = self.config['server_url'] self.request() self._load_resource_list_file_if_valid() + LOCK_FP.close() def get_server_url(self): random.seed(int(time.time())) @@ -178,7 +183,6 @@ class HubServer(object): self.resource_list_file['version'][index] for index in resource_index_list ] - #TODO(wuzewu): version sort method resource_version_list = sorted(resource_version_list) if not version: if not resource_version_list: diff --git a/paddlehub/common/paddle_helper.py b/paddlehub/common/paddle_helper.py index 0fdb62c1405363e154afe6f3ab8b46401686fb93..751bdd7246998069684fb3dd5a7bc659c8b94cfe 100644 --- a/paddlehub/common/paddle_helper.py +++ b/paddlehub/common/paddle_helper.py @@ -83,7 +83,6 @@ def from_param_to_module_attr(param, module_attr): module_attr.map.data['trainable']) from_pyobj_to_module_attr(param.do_model_average, module_attr.map.data['do_model_average']) - #TODO(wuzewu): don't save learning rate from_pyobj_to_module_attr(param.optimize_attr, module_attr.map.data['optimize_attr']) from_pyobj_to_module_attr( diff --git a/paddlehub/common/utils.py b/paddlehub/common/utils.py index 
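The hub_server.py hunk above serializes config initialization across processes with an advisory fcntl lock; note the lock is only released implicitly when LOCK_FP is closed. The same pattern in isolation (POSIX-only, since fcntl does not exist on Windows; the path assumes the usual ~/.paddlehub HUB_HOME):

```python
import fcntl
import os

# Standalone sketch of the advisory-lock pattern added to HubServer.__init__.
lock_path = os.path.expanduser("~/.paddlehub/__LOCK__")
os.makedirs(os.path.dirname(lock_path), exist_ok=True)
with open(lock_path, "a+") as lock_fp:
    fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)  # blocks until no other holder
    # ... read or initialize the shared config here ...
    fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)  # closing the file also releases it
```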
00e43fba096856bd9e4e8c7f2474f3cf39fa2af9..f4c7e0c1a8bae5aa7f65a83279681244e86ddb26 100644 --- a/paddlehub/common/utils.py +++ b/paddlehub/common/utils.py @@ -117,7 +117,6 @@ def get_pykey(key, keyed_type): return str(key) -#TODO(wuzewu): solving the problem of circular references def from_pyobj_to_module_attr(pyobj, module_attr, obj_filter=None): if obj_filter and obj_filter(pyobj): return diff --git a/paddlehub/dataset/__init__.py b/paddlehub/dataset/__init__.py index bed8a5d1ffbb39dc3f3212ff4a8c0b53a782895a..801564147ed74437616d5b54664b97bc706d888c 100644 --- a/paddlehub/dataset/__init__.py +++ b/paddlehub/dataset/__init__.py @@ -20,6 +20,7 @@ from .msra_ner import MSRA_NER from .nlpcc_dbqa import NLPCC_DBQA from .lcqmc import LCQMC from .toxic import Toxic +from .squad import SQUAD from .xnli import XNLI from .glue import GLUE diff --git a/paddlehub/dataset/base_cv_dataset.py b/paddlehub/dataset/base_cv_dataset.py index 898b649947bb510daa6fd81d601597792412a752..8f1258ef87bc10348896aeab91760ff8b9a165a7 100644 --- a/paddlehub/dataset/base_cv_dataset.py +++ b/paddlehub/dataset/base_cv_dataset.py @@ -52,36 +52,35 @@ class ImageClassificationDataset(object): return dataset_path def _parse_data(self, data_path, shuffle=False, phase=None): - def _base_reader(): - data = [] - with open(data_path, "r") as file: - while True: - line = file.readline() - if not line: - break - line = line.strip() - items = line.split(" ") - if len(items) > 2: - image_path = " ".join(items[0:-1]) - else: - image_path = items[0] - if not os.path.isabs(image_path): - if self.base_path is not None: - image_path = os.path.join(self.base_path, - image_path) - label = items[-1] - data.append((image_path, items[-1])) - - if phase == 'train': - self.train_examples = data - elif phase == 'dev': - self.dev_examples = data - elif phase == 'test': - self.test_examples = data - - if shuffle: - np.random.shuffle(data) + data = [] + with open(data_path, "r") as file: + while True: + line = file.readline() + if not line: + break + line = line.strip() + items = line.split(" ") + if len(items) > 2: + image_path = " ".join(items[0:-1]) + else: + image_path = items[0] + if not os.path.isabs(image_path): + if self.base_path is not None: + image_path = os.path.join(self.base_path, image_path) + label = items[-1] + data.append((image_path, items[-1])) + + if phase == 'train': + self.train_examples = data + elif phase == 'dev': + self.dev_examples = data + elif phase == 'test': + self.test_examples = data + + if shuffle: + np.random.shuffle(data) + def _base_reader(): for item in data: yield item diff --git a/paddlehub/dataset/glue.py b/paddlehub/dataset/glue.py index f8c0240c3b1d30c394f5e313e83765b727d187b8..4359f9dfbbeb13801dab937bb1ff6be301527e6a 100644 --- a/paddlehub/dataset/glue.py +++ b/paddlehub/dataset/glue.py @@ -39,11 +39,18 @@ class GLUE(HubDataset): def __init__(self, sub_dataset='SST-2'): # sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B if sub_dataset not in [ - 'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B' + 'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP', + 'RTE', 'SST-2', 'STS-B' ]: raise Exception( sub_dataset + " is not in GLUE benchmark. 
Please confirm the data set") + self.mismatch = False + if sub_dataset == 'MNLI_mm': + sub_dataset = 'MNLI' + self.mismatch = True + elif sub_dataset == 'MNLI_m': + sub_dataset = 'MNLI' self.sub_dataset = sub_dataset self.dataset_dir = os.path.join(DATA_HOME, "glue_data") @@ -64,9 +71,12 @@ class GLUE(HubDataset): self.train_examples = self._read_tsv(self.train_file) def _load_dev_examples(self): - if self.sub_dataset == 'MNLI': + if self.sub_dataset == 'MNLI' and not self.mismatch: self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset, "dev_matched.tsv") + elif self.sub_dataset == 'MNLI' and self.mismatch: + self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset, + "dev_mismatched.tsv") else: self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset, "dev.tsv") @@ -76,9 +86,12 @@ class GLUE(HubDataset): self.test_examples = [] def _load_predict_examples(self): - if self.sub_dataset == 'MNLI': + if self.sub_dataset == 'MNLI' and not self.mismatch: self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset, "test_matched.tsv") + elif self.sub_dataset == 'MNLI' and self.mismatch: + self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset, + "test_mismatched.tsv") else: self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset, "test.tsv") @@ -187,7 +200,7 @@ class GLUE(HubDataset): seq_id += 1 examples.append(example) except: - print("[Discard Incorrect Data] " + "\t".join(line)) + logger.info("[Discard Incorrect Data] " + "\t".join(line)) return examples diff --git a/paddlehub/dataset/squad.py b/paddlehub/dataset/squad.py new file mode 100644 index 0000000000000000000000000000000000000000..4294cd0696fcb36d6ff6b54f337d3f4eef417afe --- /dev/null +++ b/paddlehub/dataset/squad.py @@ -0,0 +1,206 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +import json +import os +import sys + +from paddlehub.reader import tokenization +from paddlehub.common.downloader import default_downloader +from paddlehub.common.dir import DATA_HOME +from paddlehub.common.logger import logger + +_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz" + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
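The SQuAD loader below (_read_json) builds doc_tokens and char_to_word_offset so that a character-level answer_start can be converted into word-level start/end positions. A condensed sketch of that mapping; note the real code tests specific whitespace characters (space, tab, CR, LF, U+202F) rather than str.isspace():

```python
# Whitespace tokenization with a char -> word-index map, as in _read_json below.
def build_offsets(paragraph_text):
    doc_tokens, char_to_word_offset = [], []
    prev_is_whitespace = True
    for c in paragraph_text:
        if c.isspace():  # simplification of the loader's is_whitespace()
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)       # start a new word
            else:
                doc_tokens[-1] += c        # extend the current word
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    return doc_tokens, char_to_word_offset

tokens, offsets = build_offsets("The cat sat.")
# tokens == ["The", "cat", "sat."]; offsets[4] == 1 (character 4 is inside "cat")
```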
+ """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % (tokenization.printable_text( + self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class SQUAD(object): + """A single set of features of data.""" + + def __init__(self, version_2_with_negative=False): + self.dataset_dir = os.path.join(DATA_HOME, "squad_data") + if not os.path.exists(self.dataset_dir): + ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress( + url=_DATA_URL, save_path=DATA_HOME, print_progress=True) + else: + logger.info("Dataset {} already cached.".format(self.dataset_dir)) + + self._load_train_examples(version_2_with_negative, is_training=True) + self._load_predict_examples(version_2_with_negative, is_training=False) + + def _load_train_examples(self, + version_2_with_negative=False, + is_training=True): + if not version_2_with_negative: + self.train_file = os.path.join(self.dataset_dir, "train-v1.1.json") + else: + self.train_file = os.path.join(self.dataset_dir, "train-v2.0.json") + + self.train_examples = self._read_json(self.train_file, is_training, + version_2_with_negative) + + def _load_predict_examples(self, + version_2_with_negative=False, + is_training=False): + if not version_2_with_negative: + self.predict_file = os.path.join(self.dataset_dir, "dev-v1.1.json") + else: + self.predict_file = os.path.join(self.dataset_dir, "dev-v2.0.json") + + self.predict_examples = self._read_json(self.predict_file, is_training, + version_2_with_negative) + + def get_train_examples(self): + return self.train_examples + + def get_dev_examples(self): + return [] + + def get_test_examples(self): + return [] + + def _read_json(self, input_file, is_training, + version_2_with_negative=False): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord( + c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + 
raise ValueError( + "For training, each question should have exactly 1 answer." + ) + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[ + answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize( + orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning( + "Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +if __name__ == "__main__": + ds = SQUAD(version_2_with_negative=True) + examples = ds.get_dev_examples() + for index, e in enumerate(examples): + if index < 10: + print(e) diff --git a/paddlehub/dataset/xnli.py b/paddlehub/dataset/xnli.py index 45aefd40118edff2975dfb532e2107032cd69eb8..4fd4461a489d10eded06dc9c6ffd889fe313262b 100644 --- a/paddlehub/dataset/xnli.py +++ b/paddlehub/dataset/xnli.py @@ -43,6 +43,7 @@ class XNLI(HubDataset): "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", "ur", "vi", "zh" ]: + raise Exception(language + "is not in XNLI. Please confirm the language") self.language = language diff --git a/paddlehub/finetune/checkpoint.proto b/paddlehub/finetune/checkpoint.proto index f568313a9299a3c90e48ae4847f44813d7a1b838..b9e4bdae9e829f65fa8cbf97ab921476ef88311b 100644 --- a/paddlehub/finetune/checkpoint.proto +++ b/paddlehub/finetune/checkpoint.proto @@ -22,4 +22,5 @@ message CheckPoint { int64 current_epoch = 1; int64 global_step = 2; string latest_model_dir = 3; + double best_score = 4; } diff --git a/paddlehub/finetune/checkpoint.py b/paddlehub/finetune/checkpoint.py index 84348e314da4ac9892668499a747e79dde5879a7..b3ace5b63042349331d91d00d72617fd9c65d074 100644 --- a/paddlehub/finetune/checkpoint.py +++ b/paddlehub/finetune/checkpoint.py @@ -37,6 +37,7 @@ def load_checkpoint(checkpoint_dir, exe, main_program): ckpt.ParseFromString(f.read()) current_epoch = 1 global_step = 0 + best_score = -999 def if_exist(var): return os.path.exists(os.path.join(ckpt.latest_model_dir, var.name)) @@ -45,20 +46,27 @@ def load_checkpoint(checkpoint_dir, exe, main_program): fluid.io.load_vars( exe, ckpt.latest_model_dir, main_program, predicate=if_exist) + # Compatible with older versions without best_score in checkpoint_pb2 + try: + best_score = ckpt.best_score + except: + best_score = -999 + logger.info("PaddleHub model checkpoint loaded. 
current_epoch={}, " - "global_step={}".format(ckpt.current_epoch, - ckpt.global_step)) - return True, ckpt.current_epoch, ckpt.global_step + "global_step={}, best_score={:.5f}".format( + ckpt.current_epoch, ckpt.global_step, best_score)) + + return True, ckpt.current_epoch, ckpt.global_step, best_score - logger.info( - "PaddleHub model checkpoint not found, start training from scratch...") + logger.info("PaddleHub model checkpoint not found, start from scratch...") - return False, current_epoch, global_step + return False, current_epoch, global_step, best_score def save_checkpoint(checkpoint_dir, current_epoch, global_step, + best_score, exe, main_program=fluid.default_main_program()): @@ -73,5 +81,6 @@ def save_checkpoint(checkpoint_dir, ckpt.current_epoch = current_epoch ckpt.global_step = global_step ckpt.latest_model_dir = model_saved_dir + ckpt.best_score = best_score with open(ckpt_meta_path, "wb") as f: f.write(ckpt.SerializeToString()) diff --git a/paddlehub/finetune/checkpoint_pb2.py b/paddlehub/finetune/checkpoint_pb2.py index e03832241452af82aecc2430fd968ba1c9542610..20c193fb2125ca94832fc80260e8c228683412a0 100644 --- a/paddlehub/finetune/checkpoint_pb2.py +++ b/paddlehub/finetune/checkpoint_pb2.py @@ -1,4 +1,3 @@ -#coding:utf-8 # Generated by the protocol buffer compiler. DO NOT EDIT! # source: checkpoint.proto @@ -18,7 +17,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( package='paddlehub.task.checkpoint', syntax='proto3', serialized_pb=_b( - '\n\x10\x63heckpoint.proto\x12\x19paddlehub.task.checkpoint\"R\n\nCheckPoint\x12\x15\n\rcurrent_epoch\x18\x01 \x01(\x03\x12\x13\n\x0bglobal_step\x18\x02 \x01(\x03\x12\x18\n\x10latest_model_dir\x18\x03 \x01(\tB\x02H\x03\x62\x06proto3' + '\n\x10\x63heckpoint.proto\x12\x19paddlehub.task.checkpoint\"f\n\nCheckPoint\x12\x15\n\rcurrent_epoch\x18\x01 \x01(\x03\x12\x13\n\x0bglobal_step\x18\x02 \x01(\x03\x12\x18\n\x10latest_model_dir\x18\x03 \x01(\t\x12\x12\n\nbest_score\x18\x04 \x01(\x01\x42\x02H\x03\x62\x06proto3' )) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -77,6 +76,22 @@ _CHECKPOINT = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='best_score', + full_name='paddlehub.task.checkpoint.CheckPoint.best_score', + index=3, + number=4, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), ], extensions=[], nested_types=[], @@ -87,7 +102,7 @@ _CHECKPOINT = _descriptor.Descriptor( extension_ranges=[], oneofs=[], serialized_start=47, - serialized_end=129, + serialized_end=149, ) DESCRIPTOR.message_types_by_name['CheckPoint'] = _CHECKPOINT diff --git a/paddlehub/finetune/evaluate.py b/paddlehub/finetune/evaluate.py index 6ef89466b5f057e68cef2e525f955fefc3cfdc32..d6033f155295248c51e2adadce2ce3738b688c24 100644 --- a/paddlehub/finetune/evaluate.py +++ b/paddlehub/finetune/evaluate.py @@ -128,3 +128,75 @@ def calculate_f1(num_label, num_infer, num_correct): else: f1 = 2 * precision * recall / (precision + recall) return precision, recall, f1 + + +def calculate_f1_np(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + + tp = np.sum((labels == 1) & (preds == 1)) + tn = np.sum((labels == 0) & (preds == 0)) + fp = np.sum((labels == 0) & (preds == 1)) + fn = np.sum((labels == 1) & (preds == 0)) + p = tp / (tp + fp) if (tp + fp) else 0 + r = tp / (tp + fn) if (tp + fn) else 0 + f1 = (2 * p * r) / (p + r) if p + r 
else 0 + return f1 + + +def matthews_corrcoef(preds, labels): + preds = np.array(preds) + labels = np.array(labels) + + tp = np.sum((labels == 1) & (preds == 1)) + tn = np.sum((labels == 0) & (preds == 0)) + fp = np.sum((labels == 0) & (preds == 1)) + fn = np.sum((labels == 1) & (preds == 0)) + + div = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) + mcc = ((tp * tn) - (fp * fn)) / np.sqrt(div) if div else 0 + return mcc + + +def recall_nk(data, n, k, m): + ''' + This metric can be used to evaluate whether the model can find the correct response B for question A + Note: Only applies to each question A only has one correct response B1. + + Parameters + ---------- + data: List. Each element is a tuple, consist of the positive probability of the sample prediction and its label. + For each example, the only one true positive sample should be the first tuple. + n: int. The number of labels per example. + eg: [A,B1,1], [A,B2,0], [A,B3,0] n=3 as there has 3 labels for example A + k: int. If the top k is right, the example will be considered right. + eg: [A,B1,1]=0.5, [A,B2,0]=0.8, [A,B3,0]=0.3(Probability of 1) + If k=2, the prediction for the example A will be considered correct as 0.5 is the top2 Probability + If k=1, the prediction will be considered wrong as 0.5 is not the biggest probability. + m: int. For every m examples, there's going to be a positive sample. + eg. data= [A1,B1,1], [A1,B2,0], [A1,B3,0], [A2,B1,1], [A2,B2,0], [A2,B3,0] + For every 3 examples, there will be one positive sample. so m=3, and n can be 1,2 or 3. + ''' + + def get_p_at_n_in_m(data, n, k, ind): + """ + calculate precision in recall n + """ + pos_score = data[ind][0] + curr = data[ind:ind + n] + curr = sorted(curr, key=lambda x: x[0], reverse=True) + if curr[k - 1][0] <= pos_score: + return 1 + return 0 + + correct_num = 0.0 + + length = len(data) // m + + for i in range(0, length): + ind = i * m + assert data[ind][1] == 1 + + correct_num += get_p_at_n_in_m(data, n, k, ind) + + return correct_num / length diff --git a/paddlehub/finetune/optimization.py b/paddlehub/finetune/optimization.py deleted file mode 100644 index e5bdacfe154abb9a04e3bf5df859973813fac25e..0000000000000000000000000000000000000000 --- a/paddlehub/finetune/optimization.py +++ /dev/null @@ -1,119 +0,0 @@ -#coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
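Since the recall_nk docstring above is dense, a small worked example may help (hypothetical probabilities, assuming the function is importable from paddlehub.finetune.evaluate):

```python
from paddlehub.finetune.evaluate import recall_nk  # added above

# Two questions, m=3 candidates each, n=3; per the contract, the true
# response is the first tuple of each group of m.
data = [
    (0.9, 1), (0.2, 0), (0.1, 0),  # Q1: positive has the top probability
    (0.4, 1), (0.8, 0), (0.3, 0),  # Q2: positive is only ranked second
]
print(recall_nk(data, n=3, k=1, m=3))  # 0.5  (Q1 hit, Q2 miss)
print(recall_nk(data, n=3, k=2, m=3))  # 1.0  (both positives within the top 2)
```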
-"""Optimization and learning rate scheduling.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle.fluid as fluid -import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler -from paddle.fluid.layers import control_flow -from paddlehub.common.logger import logger - - -def adam_weight_decay_optimization(loss, - warmup_steps, - num_train_steps, - learning_rate, - main_program, - weight_decay, - scheduler='linear_decay'): - if scheduler == 'noam_decay': - if warmup_steps > 0: - scheduled_lr = fluid.layers.learning_rate_scheduler\ - .noam_decay(1/(warmup_steps *(learning_rate ** 2)), - warmup_steps) - else: - logger.warning( - "Noam decay learning rate scheduler should have positive \ - warmup steps, using constant learning rate instead!") - - scheduled_lr = fluid.layers.create_global_var( - shape=[1], - value=learning_rate, - dtype='float32', - persistable=True, - name="learning_rate") - elif scheduler == 'linear_decay': - scheduled_lr = linear_warmup_decay(learning_rate, num_train_steps, - warmup_steps, main_program) - else: - raise ValueError("Unkown learning rate scheduler, should be " - "'noam_decay' or 'linear_decay'") - - optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) - - clip_norm_thres = 1.0 - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres)) - - def exclude_from_weight_decay(name): - if name.find("layer_norm") > -1: - return True - bias_suffix = ["_bias", "_b", ".b_0"] - for suffix in bias_suffix: - if name.endswith(suffix): - return True - return False - - param_list = dict() - - for param in main_program.global_block().all_parameters(): - param_list[param.name] = param * 1.0 - param_list[param.name].stop_gradient = True - - _, param_grads = optimizer.minimize(loss) - - if weight_decay > 0: - for param, grad in param_grads: - if exclude_from_weight_decay(param.name): - continue - with param.block.program._optimized_guard( - [param, grad]), fluid.framework.name_scope("weight_decay"): - updated_param = param - param_list[ - param.name] * weight_decay * scheduled_lr - fluid.layers.assign(output=param, input=updated_param) - - return scheduled_lr - - -def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps, - main_program): - with main_program._lr_schedule_guard(): - global_step = lr_scheduler._decay_step_counter() - - lr = fluid.layers.create_global_var( - shape=[1], - value=init_lr, - dtype='float32', - persistable=True, - name="learning_rate") - - with control_flow.Switch() as switch: - with switch.case(global_step < num_warmup_steps): - decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps - fluid.layers.assign(decayed_lr, lr) - with switch.default(): - decayed_lr = lr_scheduler.polynomial_decay( - learning_rate=init_lr, - decay_steps=num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) - fluid.layers.assign(decayed_lr, lr) - - return lr diff --git a/paddlehub/finetune/strategy.py b/paddlehub/finetune/strategy.py index ed4fdff644a3aa519dda98e8089a42a0899b389c..0ba58cfd3dcf7c0bbc37951245adcd440c25b729 100644 --- a/paddlehub/finetune/strategy.py +++ b/paddlehub/finetune/strategy.py @@ -18,12 +18,15 @@ from __future__ import division from __future__ import print_function import os +import math import multiprocessing import paddle.fluid as fluid -from paddlehub.finetune.optimization import adam_weight_decay_optimization +from paddlehub.common.logger import logger from 
paddlehub.finetune.regularizer import L2SPDecayRegularizer +import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler +from paddle.fluid.layers import control_flow def get_pretrained_parameter(main_program, start_program): @@ -40,6 +43,99 @@ def get_pretrained_parameter(main_program, start_program): return pretrained_parameters +def get_parentOp_depth_max(parent_ops, op_depth_dict): + max_depth = 1 + for parent_op in parent_ops: + depth = op_depth_dict.get(parent_op, 1) + if max_depth < depth: + max_depth = depth + return max_depth + + +def get_opDepth_min(ops, op_depth_dict): + min_depth = max(op_depth_dict.values()) + for op in ops: + depth = op_depth_dict[op] + if min_depth > depth: + min_depth = depth + return min_depth + + +def get_depth_parameter(main_program): + global_block = main_program.global_block() + + var_op_dict = {} + for op in global_block.ops: + + for input_arg in op.input_arg_names: + if input_arg not in var_op_dict.keys(): + var_op_dict[input_arg] = {"output_ops": [], "input_ops": []} + var_op_dict[input_arg]["output_ops"].append(op) + + for output_arg in op.output_arg_names: + if output_arg not in var_op_dict.keys(): + var_op_dict[output_arg] = {"output_ops": [], "input_ops": []} + var_op_dict[output_arg]["input_ops"].append(op) + + op_depth_dict = {} + for op in global_block.ops: + parent_ops = [] + for input_arg in op.input_arg_names: + for parent_op in var_op_dict[input_arg]["input_ops"]: + if parent_op not in parent_ops: + parent_ops.append(parent_op) + if not parent_ops: + op_depth_dict[op] = 1 + else: + op_depth_dict[op] = get_parentOp_depth_max(parent_ops, + op_depth_dict) + 1 + + depth_params_dict = {} + updated_depth_params_dict = {} + for param in global_block.iter_parameters(): + adherent_ops = var_op_dict[param.name]["output_ops"] + depth = get_opDepth_min(adherent_ops, op_depth_dict) + if depth not in depth_params_dict.keys(): + depth_params_dict[depth] = [] + updated_depth_params_dict[depth] = [] + depth_params_dict[depth].append(param) + updated_depth_params_dict[depth].append(param) + + depth_list = sorted(depth_params_dict.keys()) + len_depth_list = len(depth_list) + for index, depth in enumerate(depth_list): + for param in depth_params_dict[depth]: + prefix = param.name.split(".")[0] + if index < len_depth_list - 1: + next_depth = depth_list[index + 1] + for param_next_depth in depth_params_dict[next_depth]: + prefix_next_depth = param_next_depth.name.split(".")[0] + if prefix == prefix_next_depth: + updated_depth_params_dict[depth].append( + param_next_depth) + updated_depth_params_dict[next_depth].remove( + param_next_depth) + + if not updated_depth_params_dict[next_depth]: + updated_depth_params_dict.pop(next_depth) + + return updated_depth_params_dict + + +def set_gradual_unfreeze(main_program, unfreeze_depths): + depth_params_dict = get_depth_parameter(main_program) + + for depth in unfreeze_depths: + for index, param in enumerate(depth_params_dict[depth]): + depth_params_dict[depth][index].stop_gradient = False + + freeze_depths = list( + set(depth_params_dict.keys()).difference(set(unfreeze_depths))) + for depth in freeze_depths: + for index, param in enumerate(depth_params_dict[depth]): + depth_params_dict[depth][index].stop_gradient = True + + class DefaultStrategy(object): def __init__(self, learning_rate=1e-4, optimizer_name="adam"): self.learning_rate = learning_rate @@ -75,133 +171,403 @@ class DefaultStrategy(object): self.optimizer = fluid.optimizer.Adam( learning_rate=self.learning_rate) - def execute(self, loss, data_reader, 
config): + def execute(self, loss, data_reader, config, dev_count): if self.optimizer is not None: self.optimizer.minimize(loss) else: raise ValueError("DefaultStrategy's optimizer is None") - # TODO complete __str__() def __str__(self): return "DefaultStrategy" + def step(self): + pass + -class AdamWeightDecayStrategy(DefaultStrategy): +class CombinedStrategy(DefaultStrategy): def __init__(self, + optimizer_name="adam", learning_rate=1e-4, - lr_scheduler="linear_decay", - warmup_proportion=0.1, - weight_decay=0.01, - optimizer_name="adam"): - super(AdamWeightDecayStrategy, self).__init__( - learning_rate=learning_rate, optimizer_name=optimizer_name) - # check strategy correctness - if lr_scheduler not in ["linear_decay", "noam_decay"]: - raise ValueError("lr_scheduler {} is not setup " - "correctly".format(lr_scheduler)) - self._lr_scheduler = lr_scheduler - self._warmup_proportion = warmup_proportion - self._weight_decay = weight_decay - - @property - def lr_scheduler(self): - return self._lr_scheduler - - @property - def warmup_proportion(self): - return self._warmup_proportion - - @property - def weight_decay(self): - return self._weight_decay - - def execute(self, loss, data_reader, config): - main_program = loss.block.program - # calculate wamrup step - dev_count = self._get_dev_count(config) + scheduler=None, + regularization=None, + clip=None): + super(CombinedStrategy, self).__init__( + optimizer_name=optimizer_name, learning_rate=learning_rate) + + # init set + self.scheduler = { + "warmup": 0.0, + "linear_decay": { + "start_point": 1.0, + "end_learning_rate": 0.0, + }, + "noam_decay": False, + "discriminative": { + "blocks": 0, + "factor": 2.6 + }, + "gradual_unfreeze": 0, + "slanted_triangle": { + "cut_fraction": 0.0, + "ratio": 32 + } + } + + self.regularization = { + "L2": 0.0, + "L2SP": 0.0, + "weight_decay": 0.0, + } + + self.clip = {"GlobalNorm": 0.0, "Norm": 0.0} + + if scheduler == None: + scheduler = {} + if regularization == None: + regularization = {} + if clip == None: + clip = {} + + # check legality and assign + for name in scheduler: + self.check_assign(self.scheduler, name, scheduler[name]) + for name in regularization: + self.check_assign(self.regularization, name, regularization[name]) + for name in clip: + self.check_assign(self.clip, name, clip[name]) + + self.epoch = 0 + self.main_program = None + + def check_assign(self, dictionary, key, value): + if key not in dictionary: + raise ValueError("Invalid parameter: %s" % key) + if isinstance(value, dict) and isinstance(dictionary[key], dict): + sub_dict = dictionary[key] + for sub_name in value: + self.check_assign(sub_dict, sub_name, value[sub_name]) + elif isinstance(dictionary[key], + type(value)) or (isinstance(dictionary[key], float) + and isinstance(value, (float, int))): + dictionary[key] = value + else: + if isinstance(dictionary[key], dict): + raise ValueError( + "The type of parameter %s should be a dict with keys: %s" % + (key, dictionary[key].keys())) + else: + raise ValueError("The type of parameter %s should be %s" % + (key, type(dictionary[key]))) + + def add_scheduler(self, name="warmup", value=0, **values): + if values: + self.check_assign(self.scheduler, name, values) + else: + self.check_assign(self.scheduler, name, value) + + def add_regularization(self, name="L2", value=1e-3, **values): + if values: + self.check_assign(self.regularization, name, values) + else: + self.check_assign(self.regularization, name, value) + + def add_clip(self, name="GlobalNorm", value=1.0, **values): + if values: 
+ self.check_assign(self.clip, name, values) + else: + self.check_assign(self.clip, name, value) + + def scheduler_handler(self, max_train_steps): + scheduled_lr = fluid.layers.create_global_var( + shape=[1], + value=self.learning_rate, + dtype='float32', + persistable=True, + name="learning_rate") + + if not self.scheduler["slanted_triangle"]["cut_fraction"]: + warmup_steps = int(max_train_steps * self.scheduler["warmup"]) + linear_decay_start = int( + max_train_steps * self.scheduler["linear_decay"]["start_point"]) + if linear_decay_start < warmup_steps: + logger.warning( + "linear decay can not start during warmup process," + "it will start after warmup ends!") + linear_decay_start = warmup_steps + if self.scheduler["noam_decay"]: + if warmup_steps > 0: + scheduled_lr = fluid.layers.learning_rate_scheduler \ + .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)), + warmup_steps) + else: + logger.warning( + "Noam decay learning rate scheduler should have positive \ + warmup steps, using constant learning rate instead!") + + if not self.scheduler["noam_decay"] and \ + (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"]<1): + with self.main_program._lr_schedule_guard(): + global_step = lr_scheduler._decay_step_counter() + with control_flow.Switch() as switch: + if warmup_steps > 0: + with switch.case(global_step < warmup_steps): + decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps + fluid.layers.assign(decayed_lr, scheduled_lr) + if self.scheduler["linear_decay"]["start_point"] < 1: + with switch.case(global_step >= linear_decay_start): + decayed_lr = lr_scheduler.polynomial_decay( + learning_rate=self.learning_rate, + decay_steps=max_train_steps, + end_learning_rate=self.scheduler[ + "linear_decay"]["end_learning_rate"], + power=1.0, + cycle=False) + fluid.layers.assign(decayed_lr, scheduled_lr) + else: + if self.scheduler["warmup"] or self.scheduler[ + "noam_decay"] or self.scheduler["linear_decay"][ + "start_point"] < 1: + logger.warning( + "You are using slanted_triangle learning rate " + "which will make warmup, noam_decay and linear_decay unable" + ) + cut_step = int(max_train_steps * + self.scheduler["slanted_triangle"]["cut_fraction"]) + ratio = self.scheduler["slanted_triangle"]["ratio"] + global_step = lr_scheduler._decay_step_counter() + with control_flow.Switch() as switch: + with switch.case(global_step <= cut_step): + pct = global_step / cut_step + decayed_lr = self.learning_rate * (1 + pct * + (ratio - 1)) / ratio + fluid.layers.assign(decayed_lr, scheduled_lr) + with switch.default(): + pct = 1 - (global_step - cut_step) / ( + max_train_steps - cut_step) + decayed_lr = self.learning_rate * (1 + pct * + (ratio - 1)) / ratio + fluid.layers.assign(decayed_lr, scheduled_lr) + + super(CombinedStrategy, self).__init__( + optimizer_name=self._optimizer_name, learning_rate=scheduled_lr) + + if self.scheduler["discriminative"]["blocks"]: + _block_layers = math.ceil( + len(self.sorted_depth) / + self.scheduler["discriminative"]["blocks"]) + power = 0 + for cnt, depth in enumerate(self.sorted_depth): + for index, param in enumerate(self.depth_params_dict[depth]): + param.optimize_attr["learning_rate"] *= \ + pow(1.0 / self.scheduler["discriminative"]["factor"], power) + if cnt and cnt % _block_layers == 0: + power += 1 + return scheduled_lr + + def clip_handler(self): + if self.clip["GlobalNorm"]: + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm( + clip_norm=self.clip["GlobalNorm"])) + elif self.clip["Norm"]: + 
fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByNorm(clip_norm=self.clip["Norm"])) + + def regularization_handler(self, loss, scheduled_lr): + if self.regularization["L2"]: + for param in self.main_program.global_block().all_parameters(): + param.regularizer = fluid.regularizer.L2Decay( + regularization_coeff=self.regularization["L2"]) + + pretrained_params = get_pretrained_parameter( + self.main_program, fluid.default_startup_program()) + + if self.regularization["L2SP"]: + #TODO: L2SP can only run in one process now + for index, param in enumerate(pretrained_params): + param.regularizer = L2SPDecayRegularizer( + regularization_coeff=self.regularization["L2SP"]) + + _, param_grads = self.optimizer.minimize(loss) + + if self.regularization["weight_decay"]: + param_list = {} + for param in self.main_program.global_block().all_parameters(): + param_list[param.name] = param * 1.0 + param_list[param.name].stop_gradient = True + + for param, grad in param_grads: + if self.exclude_from_weight_decay(param.name): + continue + with param.block.program._optimized_guard( + [param, grad]), fluid.framework.name_scope("weight_decay"): + updated_param = param - param_list[ + param.name] * self.regularization[ + "weight_decay"] * scheduled_lr + fluid.layers.assign(output=param, input=updated_param) + + def execute(self, loss, data_reader, config, dev_count): + # base information + self.main_program = loss.block.program + self.config = config + + # self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator data_reader.data_generator( batch_size=config.batch_size, phase='train', shuffle=True) - data_reader.data_generator( - batch_size=config.batch_size, phase='val', shuffle=False) data_reader.data_generator( batch_size=config.batch_size, phase='dev', shuffle=False) - num_train_examples = data_reader.get_num_examples(phase='train') - max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count - warmup_steps = int(max_train_steps * self.warmup_proportion) - - scheduled_lr = adam_weight_decay_optimization( - loss, warmup_steps, max_train_steps, self.learning_rate, - main_program, self.weight_decay, self.lr_scheduler) + data_reader.data_generator( + batch_size=config.batch_size, phase='test', shuffle=False) + num_train_examples = len(data_reader.get_train_examples()) - return scheduled_lr + max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count - def _get_dev_count(self, config): - if config.use_cuda: - dev_count = fluid.core.get_cuda_device_count() + try: + # nlp_reader + _in_tokens = data_reader.in_tokens + if _in_tokens: + max_train_steps *= data_reader.max_seq_len + except: + # cv_reader without .in_tokens and .max_seq_len + pass + + if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[ + "gradual_unfreeze"] > 0: + self.depth_params_dict = get_depth_parameter(self.main_program) + self.sorted_depth = sorted( + self.depth_params_dict.keys(), reverse=True) + self.max_depth = len(self.sorted_depth) + + logger.info(self.__str__()) + # handle scheduler + scheduled_lr = self.scheduler_handler(max_train_steps) + + # handle clip + self.clip_handler() + + # handle regularization + self.regularization_handler(loss, scheduled_lr) + + return scheduled_lr, max_train_steps + + def exclude_from_weight_decay(self, name): + if name.find("layer_norm") > -1: + return True + bias_suffix = ["_bias", "_b", ".b_0"] + for suffix in bias_suffix: + if name.endswith(suffix): + return True + return False + + def step(self): 
+ if self.scheduler["gradual_unfreeze"] > 0: + self.epoch += 1 + if self.max_depth > 0 and self.epoch <= self.scheduler[ + "gradual_unfreeze"]: + set_gradual_unfreeze( + self.main_program, + unfreeze_depths=self. + sorted_depth[:self.max_depth * self.epoch // + self.scheduler["gradual_unfreeze"]]) + else: + logger.warning( + "The maximum op-depth in the network is %s, so the gradual unfreeze finetune strategy cannot be applied." + % (self.max_depth)) else: - dev_count = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - - return dev_count + pass - # TODO complete __str__() def __str__(self): - return "AdamWeightDecayStrategy" + return "Strategy with scheduler: %s, regularization: %s and clip: %s" % ( + self.scheduler, self.regularization, self.clip) -class DefaultFinetuneStrategy(DefaultStrategy): +class AdamWeightDecayStrategy(CombinedStrategy): def __init__(self, learning_rate=1e-4, - optimizer_name="adam", - regularization_coeff=1e-3): - super(DefaultFinetuneStrategy, self).__init__( - learning_rate=learning_rate, optimizer_name=optimizer_name) - self.learning_rate = learning_rate - self._optimizer_name = optimizer_name - self.regularization_coeff = regularization_coeff - - def execute(self, loss, data_reader, config): - # get pretrained parameters - program = loss.block.program - global_block = program.global_block() - pretrained_params = get_pretrained_parameter( - program, fluid.default_startup_program()) - - # set parameter attrs - for index, param in enumerate(pretrained_params): - param.regularizer = fluid.regularizer.L2Decay( - regularization_coeff=self.regularization_coeff) - - if self.optimizer is not None: - self.optimizer.minimize(loss) + lr_scheduler="linear_decay", + warmup_proportion=0.1, + weight_decay=0.01, + optimizer_name="adam"): + scheduler = {"warmup": warmup_proportion} + if lr_scheduler == "noam_decay": + scheduler["noam_decay"] = True + elif lr_scheduler == "linear_decay": + scheduler["linear_decay"] = { + "start_point": warmup_proportion, + "end_learning_rate": 0, + } else: - raise ValueError("DefaultFinetuneStrategy's optimizer is None") + raise ValueError("lr_scheduler {} is not set up " + "correctly".format(lr_scheduler)) + regularization = {"weight_decay": weight_decay} + clip = {"GlobalNorm": 1.0} + super(AdamWeightDecayStrategy, self).__init__( + optimizer_name=optimizer_name, + learning_rate=learning_rate, + scheduler=scheduler, + regularization=regularization, + clip=clip) -class L2SPFinetuneStrategy(DefaultStrategy): +class L2SPFinetuneStrategy(CombinedStrategy): def __init__(self, learning_rate=1e-4, optimizer_name="adam", regularization_coeff=1e-3): + scheduler = {} + regularization = {"L2SP": regularization_coeff} + clip = {} super(L2SPFinetuneStrategy, self).__init__( - learning_rate=learning_rate, optimizer_name=optimizer_name) - self.learning_rate = learning_rate - self._optimizer_name = optimizer_name - self.regularization_coeff = regularization_coeff + optimizer_name=optimizer_name, + learning_rate=learning_rate, + scheduler=scheduler, + regularization=regularization, + clip=clip) - def execute(self, loss, data_reader, config): - # get pretrained parameters - program = loss.block.program - global_block = program.global_block() - pretrained_params = get_pretrained_parameter( - program, fluid.default_startup_program()) - # set parameter attrs - for index, param in enumerate(pretrained_params): - param.regularizer = L2SPDecayRegularizer( - regularization_coeff=self.regularization_coeff) +class 
DefaultFinetuneStrategy(CombinedStrategy): + def __init__(self, + learning_rate=1e-4, + optimizer_name="adam", + regularization_coeff=1e-3): + scheduler = {} + regularization = {"L2": regularization_coeff} + clip = {} + + super(DefaultFinetuneStrategy, self).__init__( + optimizer_name=optimizer_name, + learning_rate=learning_rate, + scheduler=scheduler, + regularization=regularization, + clip=clip) - if self.optimizer is not None: - self.optimizer.minimize(loss) - else: - raise ValueError("DefaultFinetuneStrategy's optimizer is None") + +class ULMFiTStrategy(CombinedStrategy): + def __init__(self, + learning_rate=1e-4, + optimizer_name="adam", + cut_fraction=0.1, + ratio=32, + dis_blocks=3, + factor=2.6, + frz_blocks=3): + + scheduler = { + "slanted_triangle": { + "cut_fraction": cut_fraction, + "ratio": ratio + }, + "gradual_unfreeze": frz_blocks, + "discriminative": { + "blocks": dis_blocks, + "factor": factor + } + } + regularization = {} + clip = {} + super(ULMFiTStrategy, self).__init__( + optimizer_name=optimizer_name, + learning_rate=learning_rate, + scheduler=scheduler, + regularization=regularization, + clip=clip) diff --git a/paddlehub/finetune/task/__init__.py b/paddlehub/finetune/task/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..864f276ced16095edfe6ef166121c5348cad9850 --- /dev/null +++ b/paddlehub/finetune/task/__init__.py @@ -0,0 +1,20 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .basic_task import BasicTask, RunEnv, RunState +from .classifier_task import ClassifierTask, ImageClassifierTask, TextClassifierTask, MultiLabelClassifierTask +from .reading_comprehension_task import ReadingComprehensionTask +from .regression_task import RegressionTask +from .sequence_task import SequenceLabelTask diff --git a/paddlehub/finetune/task.py b/paddlehub/finetune/task/basic_task.py similarity index 53% rename from paddlehub/finetune/task.py rename to paddlehub/finetune/task/basic_task.py index bef9727c04fe9e390177dae3023c2b1954c91ab2..2dec268d49b386f999f69e062fd4a5eef8c05c30 100644 --- a/paddlehub/finetune/task.py +++ b/paddlehub/finetune/task/basic_task.py @@ -1,5 +1,5 @@ -#coding:utf-8 -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. 
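Note on the scheduler above: the slanted_triangle branch of CombinedStrategy.scheduler_handler implements the slanted triangular learning rate from ULMFiT (Howard & Ruder, 2018). The following plain-Python restatement of the schedule that the Switch/case ops build is a standalone sketch for intuition, not part of the patch:

def slanted_triangle_lr(global_step, max_train_steps, base_lr,
                        cut_fraction=0.1, ratio=32):
    # LR rises linearly for the first cut_fraction of training, then decays
    # linearly back; it is bounded below by base_lr / ratio throughout.
    cut_step = int(max_train_steps * cut_fraction)
    if global_step <= cut_step:
        pct = global_step / cut_step
    else:
        pct = 1 - (global_step - cut_step) / (max_train_steps - cut_step)
    return base_lr * (1 + pct * (ratio - 1)) / ratio

At step 0 this yields base_lr / ratio, peaks at base_lr when global_step == cut_step, and decays back to base_lr / ratio by the last step, matching the two Switch cases above.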
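For reference, a hedged usage sketch of the incremental configuration API (add_scheduler / add_regularization / add_clip) introduced by CombinedStrategy; the import path is an assumption based on PaddleHub's package layout:

from paddlehub.finetune.strategy import CombinedStrategy  # assumed import path

strategy = CombinedStrategy(optimizer_name="adam", learning_rate=1e-4)
strategy.add_scheduler("warmup", 0.1)                  # scalar option
strategy.add_scheduler("linear_decay",                 # dict option via kwargs
                       start_point=0.1, end_learning_rate=0.0)
strategy.add_scheduler("gradual_unfreeze", 3)          # unfreeze over 3 epochs
strategy.add_regularization("L2", 1e-3)
strategy.add_clip("GlobalNorm", 1.0)

Because every option passes through check_assign, a mistyped option name or a wrong value type raises ValueError immediately instead of being silently ignored. The preset subclasses are thin wrappers over the same dictionaries: AdamWeightDecayStrategy maps to warmup + linear_decay + weight_decay + GlobalNorm clipping, and ULMFiTStrategy to slanted_triangle + gradual_unfreeze + discriminative.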
@@ -18,29 +18,19 @@ from __future__ import division from __future__ import print_function import os -import collections import contextlib import time -import multiprocessing import copy - -import numpy as np import paddle.fluid as fluid -from visualdl import LogWriter +from tb_paddle import SummaryWriter import paddlehub as hub from paddlehub.common.paddle_helper import dtype_map, clone_program from paddlehub.common.utils import mkdir, to_list from paddlehub.common.logger import logger from paddlehub.finetune.checkpoint import load_checkpoint, save_checkpoint -from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 from paddlehub.finetune.config import RunConfig -__all__ = [ - "ClassifierTask", "ImageClassifierTask", "TextClassifierTask", - "SequenceLabelTask", "MultiLabelClassifierTask" -] - class RunState(object): def __init__(self, length): @@ -92,11 +82,24 @@ class BasicTask(object): data_reader, main_program=None, startup_program=None, - config=None): + config=None, + metrics_choices="default"): # base item self._base_data_reader = data_reader self._base_feed_list = feed_list + + # metrics item + self.best_score = -999 + if metrics_choices == "default": + metrics_choices = ["acc"] + elif metrics_choices == None: + metrics_choices = [] + if isinstance(metrics_choices, list): + self.metrics_choices = metrics_choices + else: + self.metrics_choices = [metrics_choices] + if main_program is None: self._base_main_program = clone_program( fluid.default_main_program(), for_test=False) @@ -138,13 +141,16 @@ class BasicTask(object): if not os.path.exists(self.config.checkpoint_dir): mkdir(self.config.checkpoint_dir) vdl_log_dir = os.path.join(self.config.checkpoint_dir, "vdllog") - self.log_writer = LogWriter(vdl_log_dir, sync_cycle=1) + self.tb_writer = SummaryWriter(vdl_log_dir) # run environment self._phases = [] self._envs = {} self._predict_data = None + # accelerate predict + self.is_best_model_loaded = False + # set default phase self.enter_phase("train") @@ -164,9 +170,24 @@ class BasicTask(object): def init_if_necessary(self): if not self.is_checkpoint_loaded: - self.is_checkpoint_loaded = True if not self.load_checkpoint(): self.exe.run(self._base_startup_program) + self.is_checkpoint_loaded = True + self.is_best_model_loaded = False + + def init_if_load_best_model(self): + if not self.is_best_model_loaded: + best_model_path = os.path.join(self.config.checkpoint_dir, + "best_model") + logger.info("Load the best model from %s" % best_model_path) + if os.path.exists(best_model_path): + self.load_parameters(best_model_path) + self.is_checkpoint_loaded = False + self.is_best_model_loaded = True + else: + self.init_if_necessary() + else: + logger.info("The best model has been loaded") def _build_env(self): if self.env.is_inititalized: @@ -242,19 +263,16 @@ class BasicTask(object): with fluid.program_guard(self.env.main_program, self._base_startup_program): with fluid.unique_name.guard(self.env.UNG): - self.config.strategy.execute( - self.loss, self._base_data_reader, self.config) + self.scheduled_lr, self.max_train_steps = self.config.strategy.execute( + self.loss, self._base_data_reader, self.config, + self.device_count) if self.is_train_phase: loss_name = self.env.loss.name - share_vars_from = None else: loss_name = None - if self._base_compiled_program is None: - share_vars_from = None - else: - share_vars_from = self._base_compiled_program + share_vars_from = self._base_compiled_program if not self.config.use_data_parallel: if self.config.enable_memory_optim: @@ -267,9 +285,6 
@@ class BasicTask(object): share_vars_from=share_vars_from, build_strategy=self.build_strategy) - if self._base_compiled_program is None: - self._base_compiled_program = self.env.main_program_compiled - self.exe.run(self.env.startup_program) self._build_env_end_event() @@ -348,6 +363,8 @@ class BasicTask(object): @property def main_program_to_be_run(self): if self.config.use_data_parallel: + if self._base_compiled_program is None: + self._base_compiled_program = self.env.main_program_compiled return self.main_program_compiled return self.main_program @@ -420,7 +437,8 @@ class BasicTask(object): pass def _build_env_end_event(self): - pass + if not self.is_predict_phase: + self.env.score_scalar = {} def _finetune_start_event(self): logger.info("PaddleHub finetune start") @@ -438,14 +456,61 @@ class BasicTask(object): logger.info("Evaluation on {} dataset start".format(self.phase)) def _eval_end_event(self, run_states): - run_speed = self._calculate_metrics(run_states) - logger.info("[%s dataset evaluation result] [step/sec: %.2f]" % - (self.phase, run_speed)) + eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states) + self.tb_writer.add_scalar( + tag=self.phase + "/Loss [{}]".format(self.phase), + scalar_value=eval_loss, + global_step=self.current_step) + + log_scores = "" + for metric in eval_scores: + self.tb_writer.add_scalar( + tag=self.phase + "/{} [{}]".format(metric, self.phase), + scalar_value=eval_scores[metric], + global_step=self.current_step) + + log_scores += "%s=%.5f " % (metric, eval_scores[metric]) + logger.info( + "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" % + (self.phase, eval_loss, log_scores, run_speed)) + + eval_scores_items = eval_scores.items() + if len(eval_scores_items): + # The first metric will be chosen for evaluation + main_metric, main_value = list(eval_scores_items)[0] + else: + logger.warning( + "No metric has been implemented, loss will be used to evaluate."
+ ) + # The larger, the better + main_metric, main_value = "negative loss", -eval_loss + if self.phase in ["dev", "val"] and main_value > self.best_score: + self.best_score = main_value + model_saved_dir = os.path.join(self.config.checkpoint_dir, + "best_model") + logger.info("best model saved to %s [best %s=%.5f]" % + (model_saved_dir, main_metric, main_value)) + save_result = fluid.io.save_persistables( + executor=self.exe, + dirname=model_saved_dir, + main_program=self.main_program) def _log_interval_event(self, run_states): - run_speed = self._calculate_metrics(run_states) - logger.info( - "step %d: [step/sec: %.2f]" % (self.current_step, run_speed)) + scores, avg_loss, run_speed = self._calculate_metrics(run_states) + self.tb_writer.add_scalar( + tag=self.phase + "/Loss [{}]".format(self.phase), + scalar_value=avg_loss, + global_step=self.current_step) + log_scores = "" + for metric in scores: + self.tb_writer.add_scalar( + tag=self.phase + "/{} [{}]".format(metric, self.phase), + scalar_value=scores[metric], + global_step=self.current_step) + log_scores += "%s=%.5f " % (metric, scores[metric]) + logger.info("step %d / %d: loss=%.5f %s[step/sec: %.2f]" % + (self.current_step, self.max_train_steps, avg_loss, + log_scores, run_speed)) def _save_ckpt_interval_event(self): self.save_checkpoint() @@ -467,9 +532,14 @@ class BasicTask(object): raise NotImplementedError def _add_metrics(self): + # Some metrics like acc and auc can be calculated by fluid.layers + # The others can be calculated in the _calculate_metrics function raise NotImplementedError def _calculate_metrics(self, run_states): + # NOTE: if you want to customize the metrics, + # you should make sure that the first value returned is a dict + # The first key will be used as the main metric to update the best model raise NotImplementedError # NOTE: current saved checkpoint mechanism is not completed, @@ -479,11 +549,12 @@ class BasicTask(object): checkpoint_dir=self.config.checkpoint_dir, current_epoch=self.current_epoch, global_step=self.current_step, + best_score=self.best_score, exe=self.exe, main_program=self.main_program) def load_checkpoint(self): - is_load_successful, self.env.current_epoch, self.env.current_step = load_checkpoint( + is_load_successful, self.env.current_epoch, self.env.current_step, self.best_score = load_checkpoint( self.config.checkpoint_dir, self.exe, main_program=self.main_program) @@ -513,24 +584,30 @@ class BasicTask(object): run_states = [] if self.current_epoch <= self.config.num_epoch: while self.current_epoch <= self.config.num_epoch: + self.config.strategy.step() run_states = self._run(do_eval=do_eval) self.env.current_epoch += 1 # Save checkpoint after finetune self.save_checkpoint() - # Final evaluation if self._base_data_reader.get_dev_examples() != []: self.eval(phase="dev") if self._base_data_reader.get_test_examples() != []: - self.eval(phase="test") + self.eval(phase="test", load_best_model=True) self._finetune_end_event(run_states) return run_states - def eval(self, phase="dev"): + def eval(self, phase="dev", load_best_model=False): + # Warning: DO NOT use eval(load_best_model=True) in finetune_and_eval + # It will make the trainer unable to continue training from the checkpoint after eval + # More importantly, the model should evaluate its current performance during training.
with self.phase_guard(phase=phase): - self.init_if_necessary() + if load_best_model: + self.init_if_load_best_model() + else: + self.init_if_necessary() self._eval_start_event() run_states = self._run() self._eval_end_event(run_states) @@ -538,11 +615,10 @@ class BasicTask(object): def predict(self, data, load_best_model=True): with self.phase_guard(phase="predict"): - self.init_if_necessary() if load_best_model: - best_model_path = os.path.join(self.config.checkpoint_dir, - "best_model") - self.load_parameters(best_model_path) + self.init_if_load_best_model() + else: + self.init_if_necessary() self._predict_data = data self._predict_start_event() run_states = self._run() @@ -567,7 +643,6 @@ class BasicTask(object): for run_step, batch in enumerate(self.reader(), start=1): if self.config.use_data_parallel and len(batch) < self.device_count: continue - step_run_state = RunState(len(self.fetch_list)) step_run_state.run_step = 1 num_batch_examples = len(batch) @@ -652,460 +727,3 @@ class BasicTask(object): break return global_run_states - - -class ClassifierTask(BasicTask): - def __init__(self, - feature, - num_classes, - feed_list, - data_reader, - startup_program=None, - config=None, - hidden_units=None): - - main_program = feature.block.program - - super(ClassifierTask, self).__init__( - data_reader=data_reader, - main_program=main_program, - feed_list=feed_list, - startup_program=startup_program, - config=config) - - self.feature = feature - self.num_classes = num_classes - self.hidden_units = hidden_units - self.best_accuracy = -1 - - def _build_net(self): - cls_feats = self.feature - if self.hidden_units is not None: - for n_hidden in self.hidden_units: - cls_feats = fluid.layers.fc( - input=cls_feats, size=n_hidden, act="relu") - - logits = fluid.layers.fc( - input=cls_feats, - size=self.num_classes, - param_attr=fluid.ParamAttr( - name="cls_out_w", - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name="cls_out_b", initializer=fluid.initializer.Constant(0.)), - act="softmax") - - return [logits] - - def _add_label(self): - return [fluid.layers.data(name="label", dtype="int64", shape=[1])] - - def _add_loss(self): - ce_loss = fluid.layers.cross_entropy( - input=self.outputs[0], label=self.labels[0]) - return fluid.layers.mean(x=ce_loss) - - def _add_metrics(self): - return [ - fluid.layers.accuracy(input=self.outputs[0], label=self.labels[0]) - ] - - def _build_env_end_event(self): - with self.log_writer.mode(self.phase) as logw: - if not self.is_predict_phase: - self.env.loss_scalar = logw.scalar( - tag="Loss [{}]".format(self.phase)) - self.env.acc_scalar = logw.scalar( - tag="Accuracy [{}]".format(self.phase)) - - def _calculate_metrics(self, run_states): - loss_sum = acc_sum = run_examples = 0 - run_step = run_time_used = 0 - for run_state in run_states: - run_examples += run_state.run_examples - run_step += run_state.run_step - loss_sum += np.mean( - run_state.run_results[-1]) * run_state.run_examples - acc_sum += np.mean( - run_state.run_results[0]) * run_state.run_examples - - run_time_used = time.time() - run_states[0].run_time_begin - avg_loss = loss_sum / run_examples - avg_acc = acc_sum / run_examples - run_speed = run_step / run_time_used - - return avg_loss, avg_acc, run_speed - - def _log_interval_event(self, run_states): - avg_loss, avg_acc, run_speed = self._calculate_metrics(run_states) - self.env.loss_scalar.add_record(self.current_step, avg_loss) - self.env.acc_scalar.add_record(self.current_step, avg_acc) - 
logger.info("step %d: loss=%.5f acc=%.5f [step/sec: %.2f]" % - (self.current_step, avg_loss, avg_acc, run_speed)) - - def _eval_end_event(self, run_states): - eval_loss, eval_acc, run_speed = self._calculate_metrics(run_states) - logger.info( - "[%s dataset evaluation result] loss=%.5f acc=%.5f [step/sec: %.2f]" - % (self.phase, eval_loss, eval_acc, run_speed)) - self.env.loss_scalar.add_record(self.current_step, eval_loss) - self.env.acc_scalar.add_record(self.current_step, eval_acc) - if self.phase in ["dev", "val"] and eval_acc > self.best_accuracy: - self.best_accuracy = eval_acc - model_saved_dir = os.path.join(self.config.checkpoint_dir, - "best_model") - logger.info("best model saved to %s [best accuracy=%.5f]" % - (model_saved_dir, self.best_accuracy)) - save_result = fluid.io.save_persistables( - executor=self.exe, - dirname=model_saved_dir, - main_program=self.main_program) - - -ImageClassifierTask = ClassifierTask - - -class TextClassifierTask(ClassifierTask): - def __init__(self, - feature, - num_classes, - feed_list, - data_reader, - startup_program=None, - config=None, - hidden_units=None): - - main_program = feature.block.program - - super(TextClassifierTask, self).__init__( - data_reader=data_reader, - feature=feature, - num_classes=num_classes, - feed_list=feed_list, - startup_program=startup_program, - config=config, - hidden_units=hidden_units) - - def _build_net(self): - cls_feats = fluid.layers.dropout( - x=self.feature, - dropout_prob=0.1, - dropout_implementation="upscale_in_train") - - if self.hidden_units is not None: - for n_hidden in self.hidden_units: - cls_feats = fluid.layers.fc( - input=cls_feats, size=n_hidden, act="relu") - - logits = fluid.layers.fc( - input=cls_feats, - size=self.num_classes, - param_attr=fluid.ParamAttr( - name="cls_out_w", - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name="cls_out_b", initializer=fluid.initializer.Constant(0.)), - act="softmax") - - return [logits] - - -class SequenceLabelTask(BasicTask): - def __init__( - self, - feature, - max_seq_len, - num_classes, - feed_list, - data_reader, - startup_program=None, - config=None, - ): - - main_program = feature.block.program - - super(SequenceLabelTask, self).__init__( - data_reader=data_reader, - main_program=main_program, - feed_list=feed_list, - startup_program=startup_program, - config=config) - - self.feature = feature - self.max_seq_len = max_seq_len - self.num_classes = num_classes - self.best_f1 = -1 - - def _build_net(self): - self.logits = fluid.layers.fc( - input=self.feature, - size=self.num_classes, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name="cls_seq_label_out_w", - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name="cls_seq_label_out_b", - initializer=fluid.initializer.Constant(0.))) - - self.ret_infers = fluid.layers.reshape( - x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) - ret_infers = fluid.layers.assign(self.ret_infers) - - self.seq_len = fluid.layers.data( - name="seq_len", shape=[1], dtype='int64') - seq_len = fluid.layers.assign(self.seq_len) - - logits = self.logits - logits = fluid.layers.flatten(logits, axis=2) - logits = fluid.layers.softmax(logits) - self.num_labels = logits.shape[1] - return [logits] - - def _add_label(self): - label = fluid.layers.data( - name="label", shape=[self.max_seq_len, 1], dtype='int64') - return [label] - - def _add_loss(self): - labels = fluid.layers.flatten(self.labels[0], axis=2) - ce_loss = 
fluid.layers.cross_entropy( - input=self.outputs[0], label=labels) - loss = fluid.layers.mean(x=ce_loss) - return loss - - def _add_metrics(self): - self.ret_labels = fluid.layers.reshape(x=self.labels[0], shape=[-1, 1]) - return [self.ret_labels, self.ret_infers, self.seq_len] - - def _build_env_end_event(self): - with self.log_writer.mode(self.phase) as logw: - if self.is_train_phase: - self.env.loss_scalar = logw.scalar( - tag="Loss [{}]".format(self.phase)) - - if self.phase in ["dev", "val"]: - self.env.loss_scalar = logw.scalar( - tag="Loss [{}]".format(self.phase)) - self.env.f1_scalar = logw.scalar( - tag="F1 [{}]".format(self.phase)) - self.env.precision_scalar = logw.scalar( - tag="Precision [{}]".format(self.phase)) - self.env.recall_scalar = logw.scalar( - tag="Recall [{}]".format(self.phase)) - - def _calculate_metrics(self, run_states): - total_infer = total_label = total_correct = loss_sum = 0 - run_step = run_time_used = run_examples = 0 - for run_state in run_states: - loss_sum += np.mean(run_state.run_results[-1]) - np_labels = run_state.run_results[0] - np_infers = run_state.run_results[1] - np_lens = run_state.run_results[2] - label_num, infer_num, correct_num = chunk_eval( - np_labels, np_infers, np_lens, self.num_labels, - self.device_count) - total_infer += infer_num - total_label += label_num - total_correct += correct_num - run_examples += run_state.run_examples - run_step += run_state.run_step - - run_time_used = time.time() - run_states[0].run_time_begin - run_speed = run_step / run_time_used - avg_loss = loss_sum / run_examples - precision, recall, f1 = calculate_f1(total_label, total_infer, - total_correct) - return precision, recall, f1, avg_loss, run_speed - - def _log_interval_event(self, run_states): - precision, recall, f1, avg_loss, run_speed = self._calculate_metrics( - run_states) - self.env.loss_scalar.add_record(self.current_step, avg_loss) - logger.info("step %d: loss=%.5f [step/sec: %.2f]" % - (self.current_step, avg_loss, run_speed)) - - def _eval_end_event(self, run_states): - precision, recall, f1, avg_loss, run_speed = self._calculate_metrics( - run_states) - self.env.loss_scalar.add_record(self.current_step, avg_loss) - self.env.f1_scalar.add_record(self.current_step, f1) - self.env.precision_scalar.add_record(self.current_step, precision) - self.env.recall_scalar.add_record(self.current_step, recall) - logger.info("[%s dataset evaluation result] [step/sec: %.2f]" % - (self.phase, run_speed)) - logger.info( - "[%s evaluation] F1-Score=%f, precision=%f, recall=%f [step/sec: %.2f]" - % (self.phase, f1, precision, recall, run_speed)) - if self.phase in ["dev", "val"] and f1 > self.best_f1: - self.best_f1 = f1 - model_saved_dir = os.path.join(self.config.checkpoint_dir, - "best_model") - logger.info("best model saved to %s [best F1=%.5f]" % - (model_saved_dir, self.best_f1)) - fluid.io.save_persistables(self.exe, dirname=model_saved_dir) - - @property - def feed_list(self): - feed_list = [varname for varname in self._base_feed_list] - if self.is_train_phase or self.is_test_phase: - feed_list += [self.labels[0].name, self.seq_len.name] - else: - feed_list += [self.seq_len.name] - return feed_list - - @property - def fetch_list(self): - if self.is_train_phase or self.is_test_phase: - return [metric.name for metric in self.metrics] + [self.loss.name] - elif self.is_predict_phase: - return [self.ret_infers.name] + [self.seq_len.name] - return [output.name for output in self.outputs] - - -class MultiLabelClassifierTask(ClassifierTask): - def 
__init__(self, - feature, - num_classes, - feed_list, - data_reader, - startup_program=None, - config=None, - hidden_units=None): - - main_program = feature.block.program - - super(MultiLabelClassifierTask, self).__init__( - data_reader=data_reader, - feature=feature, - num_classes=num_classes, - feed_list=feed_list, - startup_program=startup_program, - config=config, - hidden_units=hidden_units) - - self.best_avg_auc = -1 - - def _build_net(self): - cls_feats = fluid.layers.dropout( - x=self.feature, - dropout_prob=0.1, - dropout_implementation="upscale_in_train") - - if self.hidden_units is not None: - for n_hidden in self.hidden_units: - cls_feats = fluid.layers.fc( - input=cls_feats, size=n_hidden, act="relu") - - probs = [] - for i in range(self.num_classes): - probs.append( - fluid.layers.fc( - input=cls_feats, - size=2, - param_attr=fluid.ParamAttr( - name="cls_out_w_%d" % i, - initializer=fluid.initializer.TruncatedNormal( - scale=0.02)), - bias_attr=fluid.ParamAttr( - name="cls_out_b_%d" % i, - initializer=fluid.initializer.Constant(0.)), - act="softmax")) - - return probs - - def _add_label(self): - label = fluid.layers.data( - name="label", shape=[self.num_classes], dtype='int64') - return [label] - - def _add_loss(self): - label_split = fluid.layers.split( - self.labels[0], self.num_classes, dim=-1) - total_loss = fluid.layers.fill_constant( - shape=[1], value=0.0, dtype='float64') - for index, probs in enumerate(self.outputs): - ce_loss = fluid.layers.cross_entropy( - input=probs, label=label_split[index]) - total_loss += fluid.layers.reduce_sum(ce_loss) - loss = fluid.layers.mean(x=total_loss) - return loss - - def _add_metrics(self): - label_split = fluid.layers.split( - self.labels[0], self.num_classes, dim=-1) - # metrics change to auc of every class - eval_list = [] - for index, probs in enumerate(self.outputs): - current_auc, _, _ = fluid.layers.auc( - input=probs, label=label_split[index]) - eval_list.append(current_auc) - return eval_list - - def _build_env_end_event(self): - with self.log_writer.mode(self.phase) as logw: - if not self.is_predict_phase: - self.env.loss_scalar = logw.scalar( - tag="Loss [{}]".format(self.phase)) - if self.is_train_phase: - self.env.auc_scalar_list = [] - for i in range(self.num_classes): - self.env.auc_scalar_list.append( - logw.scalar(tag="AUC_{} [{}]".format(i, "train"))) - self.env.avg_auc_scalar = logw.scalar( - tag="Average auc [{}]".format(self.phase)) - - def _calculate_metrics(self, run_states): - loss_sum = acc_sum = run_examples = 0 - run_step = run_time_used = 0 - for run_state in run_states: - run_examples += run_state.run_examples - run_step += run_state.run_step - loss_sum += np.mean( - run_state.run_results[-1]) * run_state.run_examples - auc_list = run_states[-1].run_results[:-1] - - run_time_used = time.time() - run_states[0].run_time_begin - avg_loss = loss_sum / (run_examples * self.num_classes) - run_speed = run_step / run_time_used - - return avg_loss, auc_list, run_speed - - def _log_interval_event(self, run_states): - avg_loss, auc_list, run_speed = self._calculate_metrics(run_states) - - self.env.loss_scalar.add_record(self.current_step, avg_loss) - avg_auc = np.mean(auc_list) - self.env.avg_auc_scalar.add_record(self.current_step, avg_auc) - logger.info("step %d: loss=%.5f avg_auc=%.5f [step/sec: %.2f]" % - (self.current_step, avg_loss, avg_auc, run_speed)) - for index, auc_scalar in enumerate(self.env.auc_scalar_list): - auc_scalar.add_record(self.current_step, auc_list[index][0]) - logger.info("label_%d_auc = 
%.5f" % (index, auc_list[index][0])) - - def _eval_end_event(self, run_states): - eval_loss, auc_list, run_speed = self._calculate_metrics(run_states) - avg_auc = np.mean(auc_list) - logger.info( - "[%s dataset evaluation result] loss=%.5f avg_auc=%.5f [step/sec: %.2f]" - % (self.phase, eval_loss, avg_auc, run_speed)) - for index, auc in enumerate(auc_list): - logger.info("label_%d_auc = %.5f" % (index, auc_list[index][0])) - self.env.loss_scalar.add_record(self.current_step, eval_loss) - self.env.avg_auc_scalar.add_record(self.current_step, avg_auc) - if self.phase in ["dev", "val"] and avg_auc > self.best_avg_auc: - self.best_avg_auc = avg_auc - model_saved_dir = os.path.join(self.config.checkpoint_dir, - "best_model") - logger.info("best model saved to %s [best average auc=%.5f]" % - (model_saved_dir, self.best_avg_auc)) - save_result = fluid.io.save_persistables( - executor=self.exe, - dirname=model_saved_dir, - main_program=self.main_program) - - @property - def fetch_list(self): - if self.is_train_phase or self.is_test_phase: - return [metric.name for metric in self.metrics] + [self.loss.name] - return self.outputs diff --git a/paddlehub/finetune/task/classifier_task.py b/paddlehub/finetune/task/classifier_task.py new file mode 100644 index 0000000000000000000000000000000000000000..5dbdc917e90bb9ce09e9c0af3b7c31873213f668 --- /dev/null +++ b/paddlehub/finetune/task/classifier_task.py @@ -0,0 +1,303 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +from collections import OrderedDict +import numpy as np +import paddle.fluid as fluid + +from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef +from .basic_task import BasicTask + + +class ClassifierTask(BasicTask): + def __init__(self, + feature, + num_classes, + feed_list, + data_reader, + startup_program=None, + config=None, + hidden_units=None, + metrics_choices="default"): + if metrics_choices == "default": + metrics_choices = ["acc"] + + main_program = feature.block.program + super(ClassifierTask, self).__init__( + data_reader=data_reader, + main_program=main_program, + feed_list=feed_list, + startup_program=startup_program, + config=config, + metrics_choices=metrics_choices) + + self.feature = feature + self.num_classes = num_classes + self.hidden_units = hidden_units + + def _build_net(self): + cls_feats = self.feature + if self.hidden_units is not None: + for n_hidden in self.hidden_units: + cls_feats = fluid.layers.fc( + input=cls_feats, size=n_hidden, act="relu") + + logits = fluid.layers.fc( + input=cls_feats, + size=self.num_classes, + param_attr=fluid.ParamAttr( + name="cls_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b", initializer=fluid.initializer.Constant(0.)), + act="softmax") + + self.ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax(logits, axis=1), shape=[-1, 1]) + + return [logits] + + def _add_label(self): + return [fluid.layers.data(name="label", dtype="int64", shape=[1])] + + def _add_loss(self): + ce_loss = fluid.layers.cross_entropy( + input=self.outputs[0], label=self.labels[0]) + return fluid.layers.mean(x=ce_loss) + + def _add_metrics(self): + acc = fluid.layers.accuracy(input=self.outputs[0], label=self.labels[0]) + return [acc] + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [self.labels[0].name, self.ret_infers.name + ] + [metric.name + for metric in self.metrics] + [self.loss.name] + return [output.name for output in self.outputs] + + def _calculate_metrics(self, run_states): + loss_sum = acc_sum = run_examples = 0 + run_step = run_time_used = 0 + all_labels = np.array([]) + all_infers = np.array([]) + + for run_state in run_states: + run_examples += run_state.run_examples + run_step += run_state.run_step + loss_sum += np.mean( + run_state.run_results[-1]) * run_state.run_examples + acc_sum += np.mean( + run_state.run_results[2]) * run_state.run_examples + np_labels = run_state.run_results[0] + np_infers = run_state.run_results[1] + all_labels = np.hstack((all_labels, np_labels.reshape([-1]))) + all_infers = np.hstack((all_infers, np_infers.reshape([-1]))) + + run_time_used = time.time() - run_states[0].run_time_begin + avg_loss = loss_sum / run_examples + run_speed = run_step / run_time_used + + # The first key will be used as the main metric to update the best model + scores = OrderedDict() + + for metric in self.metrics_choices: + if metric == "acc": + avg_acc = acc_sum / run_examples + scores["acc"] = avg_acc + elif metric == "f1": + f1 = calculate_f1_np(all_infers, all_labels) + scores["f1"] = f1 + elif metric == "matthews": + matthews = matthews_corrcoef(all_infers, all_labels) + scores["matthews"] = matthews + else: + raise ValueError("Unsupported metric: \"%s\"" % metric) + + return scores, avg_loss, run_speed + + +ImageClassifierTask = ClassifierTask + + +class 
TextClassifierTask(ClassifierTask): + def __init__(self, + feature, + num_classes, + feed_list, + data_reader, + startup_program=None, + config=None, + hidden_units=None, + metrics_choices="default"): + + if metrics_choices == "default": + metrics_choices = ["acc"] + super(TextClassifierTask, self).__init__( + data_reader=data_reader, + feature=feature, + num_classes=num_classes, + feed_list=feed_list, + startup_program=startup_program, + config=config, + hidden_units=hidden_units, + metrics_choices=metrics_choices) + + def _build_net(self): + cls_feats = fluid.layers.dropout( + x=self.feature, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + + if self.hidden_units is not None: + for n_hidden in self.hidden_units: + cls_feats = fluid.layers.fc( + input=cls_feats, size=n_hidden, act="relu") + + logits = fluid.layers.fc( + input=cls_feats, + size=self.num_classes, + param_attr=fluid.ParamAttr( + name="cls_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b", initializer=fluid.initializer.Constant(0.)), + act="softmax") + + self.ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax(logits, axis=1), shape=[-1, 1]) + + return [logits] + + +class MultiLabelClassifierTask(ClassifierTask): + def __init__(self, + feature, + num_classes, + feed_list, + data_reader, + startup_program=None, + config=None, + hidden_units=None, + metrics_choices="default"): + if metrics_choices == "default": + metrics_choices = ["auc"] + + main_program = feature.block.program + super(MultiLabelClassifierTask, self).__init__( + data_reader=data_reader, + feature=feature, + num_classes=num_classes, + feed_list=feed_list, + startup_program=startup_program, + config=config, + hidden_units=hidden_units, + metrics_choices=metrics_choices) + self.class_name = list(data_reader.label_map.keys()) + + def _build_net(self): + cls_feats = fluid.layers.dropout( + x=self.feature, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + + if self.hidden_units is not None: + for n_hidden in self.hidden_units: + cls_feats = fluid.layers.fc( + input=cls_feats, size=n_hidden, act="relu") + + probs = [] + for i in range(self.num_classes): + probs.append( + fluid.layers.fc( + input=cls_feats, + size=2, + param_attr=fluid.ParamAttr( + name="cls_out_w_%d" % i, + initializer=fluid.initializer.TruncatedNormal( + scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b_%d" % i, + initializer=fluid.initializer.Constant(0.)), + act="softmax")) + + return probs + + def _add_label(self): + label = fluid.layers.data( + name="label", shape=[self.num_classes], dtype='int64') + return [label] + + def _add_loss(self): + label_split = fluid.layers.split( + self.labels[0], self.num_classes, dim=-1) + total_loss = fluid.layers.fill_constant( + shape=[1], value=0.0, dtype='float64') + for index, probs in enumerate(self.outputs): + ce_loss = fluid.layers.cross_entropy( + input=probs, label=label_split[index]) + total_loss += fluid.layers.reduce_sum(ce_loss) + loss = fluid.layers.mean(x=total_loss) + return loss + + def _add_metrics(self): + label_split = fluid.layers.split( + self.labels[0], self.num_classes, dim=-1) + # metrics change to auc of every class + eval_list = [] + for index, probs in enumerate(self.outputs): + current_auc, _, _ = fluid.layers.auc( + input=probs, label=label_split[index]) + eval_list.append(current_auc) + return eval_list + + def _calculate_metrics(self, run_states): + loss_sum = acc_sum = run_examples = 0 + run_step = run_time_used 
= 0 + for run_state in run_states: + run_examples += run_state.run_examples + run_step += run_state.run_step + loss_sum += np.mean( + run_state.run_results[-1]) * run_state.run_examples + auc_list = run_states[-1].run_results[:-1] + + run_time_used = time.time() - run_states[0].run_time_begin + avg_loss = loss_sum / (run_examples * self.num_classes) + run_speed = run_step / run_time_used + + # The first key will be used as the main metric to update the best model + scores = OrderedDict() + for metric in self.metrics_choices: + if metric == "auc": + scores["auc"] = np.mean(auc_list) + # NOTE: for MultiLabelClassifierTask, the metric is computed for every label, + # and the mean value over all labels is also reported. + for index, auc in enumerate(auc_list): + scores["auc_" + self.class_name[index]] = auc_list[index][0] + else: + raise ValueError("Unsupported metric: \"%s\"" % metric) + return scores, avg_loss, run_speed + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [metric.name for metric in self.metrics] + [self.loss.name] + return self.outputs diff --git a/paddlehub/finetune/task/reading_comprehension_task.py b/paddlehub/finetune/task/reading_comprehension_task.py new file mode 100644 index 0000000000000000000000000000000000000000..e040ef721693ab60bd686ddcfe29592f5c1b4a13 --- /dev/null +++ b/paddlehub/finetune/task/reading_comprehension_task.py @@ -0,0 +1,136 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
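The reading comprehension task whose body follows trains SQuAD-style span extraction: a size-2 fc head produces per-token start and end logits, and _add_loss averages the two softmax cross-entropies. A minimal numpy sketch of that loss, for intuition only (assumed shapes: logits [batch, seq_len], positions [batch]):

import numpy as np

def span_loss(start_logits, end_logits, start_positions, end_positions):
    # Mean of the start/end softmax cross-entropies, mirroring _add_loss below.
    def softmax_xent(logits, labels):
        shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
        return -log_probs[np.arange(len(labels)), labels].mean()
    return (softmax_xent(start_logits, start_positions) +
            softmax_xent(end_logits, end_positions)) / 2.0

At predict time the task fetches the raw start/end logits along with each example's unique_id so that answer spans can be reconstructed and scored downstream.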
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +from collections import OrderedDict + +import numpy as np +import paddle.fluid as fluid +from .basic_task import BasicTask + + +class ReadingComprehensionTask(BasicTask): + def __init__(self, + feature, + feed_list, + data_reader, + startup_program=None, + config=None, + metrics_choices=None): + + main_program = feature.block.program + super(ReadingComprehensionTask, self).__init__( + data_reader=data_reader, + main_program=main_program, + feed_list=feed_list, + startup_program=startup_program, + config=config, + metrics_choices=metrics_choices) + self.feature = feature + + def _build_net(self): + if self.is_predict_phase: + self.unique_id = fluid.layers.data( + name="unique_id", + shape=[-1, 1], + lod_level=0, + dtype="int64") + + logits = fluid.layers.fc( + input=self.feature, + size=2, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name="cls_seq_label_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_seq_label_out_b", + initializer=fluid.initializer.Constant(0.))) + + logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1]) + start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) + + batch_ones = fluid.layers.fill_constant_batch_size_like( + input=start_logits, dtype='int64', shape=[1], value=1) + num_seqs = fluid.layers.reduce_sum(input=batch_ones) + + return [start_logits, end_logits, num_seqs] + + def _add_label(self): + start_positions = fluid.layers.data( + name="start_positions", shape=[-1, 1], lod_level=0, dtype="int64") + end_positions = fluid.layers.data( + name="end_positions", shape=[-1, 1], lod_level=0, dtype="int64") + return [start_positions, end_positions] + + def _add_loss(self): + start_positions = self.labels[0] + end_positions = self.labels[1] + + start_logits = self.outputs[0] + end_logits = self.outputs[1] + + start_loss = fluid.layers.softmax_with_cross_entropy( + logits=start_logits, label=start_positions) + start_loss = fluid.layers.mean(x=start_loss) + end_loss = fluid.layers.softmax_with_cross_entropy( + logits=end_logits, label=end_positions) + end_loss = fluid.layers.mean(x=end_loss) + total_loss = (start_loss + end_loss) / 2.0 + return total_loss + + def _add_metrics(self): + return [] + + @property + def feed_list(self): + feed_list = [varname for varname in self._base_feed_list] + if self.is_train_phase: + feed_list += [self.labels[0].name, self.labels[1].name] + elif self.is_predict_phase: + feed_list += [self.unique_id.name] + return feed_list + + @property + def fetch_list(self): + if self.is_train_phase: + return [metric.name for metric in self.metrics + ] + [self.loss.name, self.outputs[-1].name] + elif self.is_predict_phase: + return [self.unique_id.name + ] + [output.name for output in self.outputs] + + def _calculate_metrics(self, run_states): + total_cost, total_num_seqs = [], [] + run_step = run_time_used = run_examples = 0 + for run_state in run_states: + np_loss = run_state.run_results[0] + np_num_seqs = run_state.run_results[1] + total_cost.extend(np_loss * np_num_seqs) + total_num_seqs.extend(np_num_seqs) + run_examples += run_state.run_examples + run_step += run_state.run_step + + run_time_used = time.time() - run_states[0].run_time_begin + run_speed = run_step / run_time_used + avg_loss = np.sum(total_cost) / np.sum(total_num_seqs) + + scores = OrderedDict() + # If no metric has been implemented, loss will be used to 
evaluate. + return scores, avg_loss, run_speed diff --git a/paddlehub/finetune/task/regression_task.py b/paddlehub/finetune/task/regression_task.py new file mode 100644 index 0000000000000000000000000000000000000000..90ab388282851d68f0ce4c0e05624598555cd1d5 --- /dev/null +++ b/paddlehub/finetune/task/regression_task.py @@ -0,0 +1,122 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +from collections import OrderedDict + +import numpy as np +import paddle.fluid as fluid +from scipy.stats import spearmanr +from .basic_task import BasicTask + + +class RegressionTask(BasicTask): + def __init__(self, + feature, + feed_list, + data_reader, + startup_program=None, + config=None, + hidden_units=None, + metrics_choices="default"): + if metrics_choices == "default": + metrics_choices = ["spearman"] + + main_program = feature.block.program + super(RegressionTask, self).__init__( + data_reader=data_reader, + main_program=main_program, + feed_list=feed_list, + startup_program=startup_program, + config=config, + metrics_choices=metrics_choices) + self.feature = feature + self.hidden_units = hidden_units + + def _build_net(self): + cls_feats = fluid.layers.dropout( + x=self.feature, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + + if self.hidden_units is not None: + for n_hidden in self.hidden_units: + cls_feats = fluid.layers.fc( + input=cls_feats, size=n_hidden, act="relu") + + logits = fluid.layers.fc( + input=cls_feats, + size=1, + param_attr=fluid.ParamAttr( + name="cls_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b", initializer=fluid.initializer.Constant(0.)), + act=None) + + return [logits] + + def _add_label(self): + return [fluid.layers.data(name="label", dtype="float32", shape=[1])] + + def _add_loss(self): + cost = fluid.layers.square_error_cost( + input=self.outputs[0], label=self.labels[0]) + return fluid.layers.mean(x=cost) + + def _add_metrics(self): + return [] + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [self.labels[0].name, self.outputs[0].name + ] + [metric.name + for metric in self.metrics] + [self.loss.name] + return [output.name for output in self.outputs] + + def _calculate_metrics(self, run_states): + loss_sum = run_examples = 0 + run_step = run_time_used = 0 + all_labels = np.array([]) + all_infers = np.array([]) + for run_state in run_states: + run_examples += run_state.run_examples + run_step += run_state.run_step + loss_sum += np.mean( + run_state.run_results[-1]) * run_state.run_examples + np_labels = run_state.run_results[0] + np_infers = run_state.run_results[1] + all_labels = np.hstack((all_labels, np_labels.reshape([-1]))) + all_infers = np.hstack((all_infers, np_infers.reshape([-1]))) + + run_time_used = time.time() - 
run_states[0].run_time_begin + avg_loss = loss_sum / run_examples + run_speed = run_step / run_time_used + + # The first key will be used as the main metric to update the best model + scores = OrderedDict() + + for metric in self.metrics_choices: + if metric == "spearman": + spearman_correlations = spearmanr(all_labels, all_infers)[0] + scores["spearman"] = spearman_correlations + else: + raise ValueError("Unsupported metric: \"%s\"" % metric) + return scores, avg_loss, run_speed diff --git a/paddlehub/finetune/task/sequence_task.py b/paddlehub/finetune/task/sequence_task.py new file mode 100644 index 0000000000000000000000000000000000000000..3116310a61b3928767434ecbfd82210b46ea940f --- /dev/null +++ b/paddlehub/finetune/task/sequence_task.py @@ -0,0 +1,148 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time +from collections import OrderedDict +import numpy as np +import paddle.fluid as fluid +from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 +from .basic_task import BasicTask + + +class SequenceLabelTask(BasicTask): + def __init__(self, + feature, + max_seq_len, + num_classes, + feed_list, + data_reader, + startup_program=None, + config=None, + metrics_choices="default"): + if metrics_choices == "default": + metrics_choices = ["f1", "precision", "recall"] + + main_program = feature.block.program + super(SequenceLabelTask, self).__init__( + data_reader=data_reader, + main_program=main_program, + feed_list=feed_list, + startup_program=startup_program, + config=config, + metrics_choices=metrics_choices) + self.feature = feature + self.max_seq_len = max_seq_len + self.num_classes = num_classes + + def _build_net(self): + self.logits = fluid.layers.fc( + input=self.feature, + size=self.num_classes, + num_flatten_dims=2, + param_attr=fluid.ParamAttr( + name="cls_seq_label_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_seq_label_out_b", + initializer=fluid.initializer.Constant(0.))) + + self.ret_infers = fluid.layers.reshape( + x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) + ret_infers = fluid.layers.assign(self.ret_infers) + + self.seq_len = fluid.layers.data( + name="seq_len", shape=[1], dtype='int64') + seq_len = fluid.layers.assign(self.seq_len) + + logits = self.logits + logits = fluid.layers.flatten(logits, axis=2) + logits = fluid.layers.softmax(logits) + self.num_labels = logits.shape[1] + return [logits] + + def _add_label(self): + label = fluid.layers.data( + name="label", shape=[self.max_seq_len, 1], dtype='int64') + return [label] + + def _add_loss(self): + labels = fluid.layers.flatten(self.labels[0], axis=2) + ce_loss = fluid.layers.cross_entropy( + input=self.outputs[0], label=labels) + loss = fluid.layers.mean(x=ce_loss) + return loss + + def _add_metrics(self): + self.ret_labels = 
fluid.layers.reshape(x=self.labels[0], shape=[-1, 1]) + return [self.ret_labels, self.ret_infers, self.seq_len] + + def _calculate_metrics(self, run_states): + total_infer = total_label = total_correct = loss_sum = 0 + run_step = run_time_used = run_examples = 0 + for run_state in run_states: + loss_sum += np.mean(run_state.run_results[-1]) + np_labels = run_state.run_results[0] + np_infers = run_state.run_results[1] + np_lens = run_state.run_results[2] + label_num, infer_num, correct_num = chunk_eval( + np_labels, np_infers, np_lens, self.num_labels, + self.device_count) + total_infer += infer_num + total_label += label_num + total_correct += correct_num + run_examples += run_state.run_examples + run_step += run_state.run_step + + run_time_used = time.time() - run_states[0].run_time_begin + run_speed = run_step / run_time_used + avg_loss = loss_sum / run_examples + + precision, recall, f1 = calculate_f1(total_label, total_infer, + total_correct) + # The first key will be used as the main metric to update the best model + scores = OrderedDict() + + for metric in self.metrics_choices: + if metric == "precision": + scores["precision"] = precision + elif metric == "recall": + scores["recall"] = recall + elif metric == "f1": + scores["f1"] = f1 + else: + raise ValueError("Unsupported metric: \"%s\"" % metric) + + return scores, avg_loss, run_speed + + @property + def feed_list(self): + feed_list = [varname for varname in self._base_feed_list] + if self.is_train_phase or self.is_test_phase: + feed_list += [self.labels[0].name, self.seq_len.name] + else: + feed_list += [self.seq_len.name] + return feed_list + + @property + def fetch_list(self): + if self.is_train_phase or self.is_test_phase: + return [metric.name for metric in self.metrics] + [self.loss.name] + elif self.is_predict_phase: + return [self.ret_infers.name] + [self.seq_len.name] + return [output.name for output in self.outputs] diff --git a/paddlehub/module/manager.py b/paddlehub/module/manager.py index 965b00b62c9e0007918bb3ea8939bd3c8d444dc4..1b1a8d629dc787e10433e96dfba4a171f7485542 100644 --- a/paddlehub/module/manager.py +++ b/paddlehub/module/manager.py @@ -26,6 +26,7 @@ from paddlehub.common.downloader import default_downloader from paddlehub.common.dir import MODULE_HOME from paddlehub.module import module_desc_pb2 import paddlehub as hub +from paddlehub.common.logger import logger class LocalModuleManager(object): @@ -35,23 +36,26 @@ class LocalModuleManager(object): if not os.path.exists(self.local_modules_dir): utils.mkdir(self.local_modules_dir) elif os.path.isfile(self.local_modules_dir): - #TODO(wuzewu): give wanring - pass + raise ValueError("Module home should be a folder, not a file") def check_module_valid(self, module_path): - #TODO(wuzewu): code - info = {} try: desc_pb_path = os.path.join(module_path, 'module_desc.pb') if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path): + info = {} + desc = module_desc_pb2.ModuleDesc() with open(desc_pb_path, "rb") as fp: desc.ParseFromString(fp.read()) info['version'] = desc.attr.map.data["module_info"].map.data[ "version"].s + return True, info + else: + logger.warning( + "%s does not exist, the module will be reinstalled" % + desc_pb_path) except: - return False, None - return True, info + pass + return False, None def all_modules(self, update=False): if not update and self.modules_dict: @@ -60,7 +64,6 @@ class LocalModuleManager(object): for sub_dir_name in os.listdir(self.local_modules_dir): sub_dir_path = os.path.join(self.local_modules_dir, sub_dir_name) if 
-                #TODO(wuzewu): get module name
                 valid, info = self.check_module_valid(sub_dir_path)
                 if valid:
                     module_name = sub_dir_name
@@ -92,7 +95,6 @@ class LocalModuleManager(object):
         url = search_result.get('url', None)
         md5_value = search_result.get('md5', None)
         installed_module_version = search_result.get('version', None)
-        #TODO(wuzewu): add compatibility check
         if not url or (module_version is not None and installed_module_version
                        != module_version) or (name != module_name):
             tips = "Can't find module %s" % module_name
diff --git a/paddlehub/module/module.py b/paddlehub/module/module.py
index 0190c492666d920abe537fc30a16ebc18f292df2..90b8082d55161b7a48285813064d1e421ca2e430 100644
--- a/paddlehub/module/module.py
+++ b/paddlehub/module/module.py
@@ -117,7 +117,6 @@ class Module(object):
         self.cache_fetch_dict = None
         self.cache_program = None
 
-        # TODO(wuzewu): print more module loading info log
         if name:
             self._init_with_name(name=name, version=version)
         elif module_dir:
@@ -458,7 +457,6 @@ class Module(object):
         fetch_dict = self.cache_fetch_dict
         program = self.cache_program
 
-        #TODO(wuzewu): more option
         fetch_list = list(set([value for key, value in fetch_dict.items()]))
         with fluid.program_guard(program):
             result = []
@@ -554,7 +552,6 @@ class Module(object):
         self._recover_variable_info(program)
         paddle_helper.set_op_attr(program, is_test=for_test)
 
-        #TODO(wuzewu): return feed_list and fetch_list directly
         feed_dict = {}
         fetch_dict = {}
         for index, var in enumerate(signature.inputs):
@@ -569,7 +566,6 @@ class Module(object):
         if key:
             fetch_dict[key] = program.global_block().var(var.name)
 
-        # TODO(ZeyuChen) encapsulate into a funtion
         # update BERT/ERNIE's input tensor's sequence length to max_seq_len
         if self.name.startswith("bert") or self.name.startswith("ernie"):
             MAX_SEQ_LENGTH = 512
diff --git a/paddlehub/reader/__init__.py b/paddlehub/reader/__init__.py
index 2627f8017fa6f91796e5ade80bdc5f694a7c6e37..6a721ac4e2653e5f143bd8bd04aa40ebe1259441 100644
--- a/paddlehub/reader/__init__.py
+++ b/paddlehub/reader/__init__.py
@@ -17,4 +17,6 @@ from .nlp_reader import ClassifyReader
 from .nlp_reader import SequenceLabelReader
 from .nlp_reader import LACClassifyReader
 from .nlp_reader import MultiLabelClassifyReader
+from .nlp_reader import ReadingComprehensionReader
+from .nlp_reader import RegressionReader
 from .cv_reader import ImageClassificationReader
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index b8abc9942f9572c3a654db04a5252b4456263dd1..afeda5966920be871b541c06f79a80ce52fdb676 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import csv
+import collections
 import json
 import numpy as np
 import platform
@@ -31,7 +31,7 @@ from paddlehub.reader import tokenization
 from paddlehub.common.logger import logger
 from paddlehub.common.utils import sys_stdout_encoding
 from paddlehub.dataset.dataset import InputExample
-from .batching import pad_batch_data
+from .batching import pad_batch_data, prepare_batch_data
 import paddlehub as hub
 
 
@@ -43,7 +43,8 @@ class BaseReader(object):
                  max_seq_len=512,
                  do_lower_case=True,
                  random_seed=None,
-                 use_task_id=False):
+                 use_task_id=False,
+                 in_tokens=False):
         self.max_seq_len = max_seq_len
         self.tokenizer = tokenization.FullTokenizer(
             vocab_file=vocab_path, do_lower_case=do_lower_case)
@@ -52,7 +53,7 @@ class BaseReader(object):
         self.pad_id = self.vocab["[PAD]"]
         self.cls_id = self.vocab["[CLS]"]
         self.sep_id = self.vocab["[SEP]"]
-        self.in_tokens = False
+        self.in_tokens = in_tokens
         self.use_task_id = use_task_id
 
         if self.use_task_id:
@@ -202,6 +203,9 @@ class BaseReader(object):
 
         return record
 
+    def _pad_batch_records(self, batch_records, phase):
+        raise NotImplementedError
+
     def _prepare_batch_data(self, examples, batch_size, phase=None):
         """generate batch records"""
         batch_records, max_len = [], 0
@@ -494,7 +498,7 @@ class SequenceLabelReader(BaseReader):
 
 
 class LACClassifyReader(object):
-    def __init__(self, dataset, vocab_path):
+    def __init__(self, dataset, vocab_path, in_tokens=False):
         self.dataset = dataset
         self.lac = hub.Module(name="lac")
         self.tokenizer = tokenization.FullTokenizer(
@@ -505,6 +509,7 @@ class LACClassifyReader(object):
             sign_name="lexical_analysis").keys())[0]
 
         self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+        self.in_tokens = in_tokens
 
     def get_num_examples(self, phase):
         """Get number of examples for train, dev or test."""
@@ -719,5 +724,536 @@ class MultiLabelClassifyReader(BaseReader):
         return record
 
 
+class SquadInputFeatures(object):
+    """A single set of features for one doc span of a SQuAD example."""
+
+    def __init__(self,
+                 unique_id,
+                 example_index,
+                 doc_span_index,
+                 tokens,
+                 token_to_orig_map,
+                 token_is_max_context,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.unique_id = unique_id
+        self.example_index = example_index
+        self.doc_span_index = doc_span_index
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+        self.token_is_max_context = token_is_max_context
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+
+class RegressionReader(BaseReader):
+    def __init__(self,
+                 dataset,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=128,
+                 do_lower_case=True,
+                 random_seed=None):
+        self.max_seq_len = max_seq_len
+        self.tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self.vocab = self.tokenizer.vocab
+        self.dataset = dataset
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.in_tokens = False
+
+        np.random.seed(random_seed)
+
+        # generate label map
+        self.label_map = {}  # Unlike BaseReader, it's not filled
+
+        self.current_example = 0
+        self.current_epoch = 0
+
+        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+
+    def _pad_batch_records(self, batch_records, phase=None):
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+
+        padded_token_ids, input_mask = pad_batch_data(
+            batch_token_ids,
+            max_seq_len=self.max_seq_len,
+            pad_idx=self.pad_id,
+            return_input_mask=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids,
+            max_seq_len=self.max_seq_len,
+            pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids,
+            max_seq_len=self.max_seq_len,
+            pad_idx=self.pad_id)
+
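+        # Illustrative shapes, assuming pad_batch_data keeps its usual
+        # contract of reshaping to [batch_size, max_seq_len, 1]: with
+        # batch_size=2 and max_seq_len=128, the three padded id arrays are
+        # [2, 128, 1] int64 tensors and input_mask is a [2, 128, 1] float32
+        # tensor with 1.0 over real tokens and 0.0 over [PAD] positions.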
+        if phase != "predict":
+            batch_labels = [record.label_id for record in batch_records]
+            # the only diff with ClassifyReader: astype("float32")
+            batch_labels = np.array(batch_labels).astype("float32").reshape(
+                [-1, 1])
+
+            return_list = [
+                padded_token_ids, padded_position_ids, padded_text_type_ids,
+                input_mask, batch_labels
+            ]
+        else:
+            return_list = [
+                padded_token_ids, padded_position_ids, padded_text_type_ids,
+                input_mask
+            ]
+
+        return return_list
+
+    def data_generator(self,
+                       batch_size=1,
+                       phase='train',
+                       shuffle=True,
+                       data=None):
+        if phase == 'train':
+            shuffle = True
+            examples = self.get_train_examples()
+            self.num_examples['train'] = len(examples)
+        elif phase == 'val' or phase == 'dev':
+            shuffle = False
+            examples = self.get_dev_examples()
+            self.num_examples['dev'] = len(examples)
+        elif phase == 'test':
+            shuffle = False
+            examples = self.get_test_examples()
+            self.num_examples['test'] = len(examples)
+        elif phase == 'predict':
+            shuffle = False
+            examples = []
+            seq_id = 0
+
+            for item in data:
+                # set a placeholder label so the program can run
+                label = -1  # different from BaseReader
+                if len(item) == 1:
+                    item_i = InputExample(
+                        guid=seq_id, text_a=item[0], label=label)
+                elif len(item) == 2:
+                    item_i = InputExample(
+                        guid=seq_id,
+                        text_a=item[0],
+                        text_b=item[1],
+                        label=label)
+                else:
+                    raise ValueError(
+                        "Each record must contain one or two texts, got %d!"
+                        % len(item))
+                examples.append(item_i)
+                seq_id += 1
+        else:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
+            )
+
+        def wrapper():
+            if shuffle:
+                np.random.shuffle(examples)
+
+            for batch_data in self._prepare_batch_data(
+                    examples, batch_size, phase=phase):
+                yield [batch_data]
+
+        return wrapper
+
+
+class ReadingComprehensionReader(object):
+    def __init__(self,
+                 dataset,
+                 vocab_path,
+                 do_lower_case=True,
+                 max_seq_length=512,
+                 doc_stride=128,
+                 max_query_length=64,
+                 random_seed=None):
+        self.dataset = dataset
+        self._tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self._max_seq_length = max_seq_length
+        self._doc_stride = doc_stride
+        self._max_query_length = max_query_length
+        self._in_tokens = False
+
+        np.random.seed(random_seed)
+
+        self.vocab = self._tokenizer.vocab
+        self.vocab_size = len(self.vocab)
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.mask_id = self.vocab["[MASK]"]
+
+        self.current_train_example = 0
+
+        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+
+    def get_train_progress(self):
+        """Gets progress for training phase."""
+        return self.current_train_example
+
+    def get_train_examples(self):
+        """Gets a collection of `SquadExample`s for the train set."""
+        return self.dataset.get_train_examples()
+
+    def get_dev_examples(self):
+        """Gets a collection of `SquadExample`s for the dev set."""
+        return self.dataset.get_dev_examples()
+
+    def get_test_examples(self):
+        """Gets a collection of `SquadExample`s for prediction."""
+        return self.dataset.get_test_examples()
+
+    def get_num_examples(self, phase):
+        if phase not in ['train', 'dev', 'test']:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'dev', 'test'].")
+        return self.num_examples[phase]
+
+    def data_generator(self,
+                       batch_size=1,
+                       phase='train',
+                       shuffle=False,
+                       data=None):
+        if phase == 'train':
+            shuffle = True
+            examples = self.get_train_examples()
+            self.num_examples['train'] = len(examples)
+        elif phase == 'dev':
+            shuffle = False
+            examples = self.get_dev_examples()
+            self.num_examples['dev'] = len(examples)
+        elif phase == 'test':
+            shuffle = False
+            examples = self.get_test_examples()
+            self.num_examples['test'] = len(examples)
+        elif phase == 'predict':
+            shuffle = False
+            examples = data
+        else:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
+            )
+
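+        # Batch assembly below (illustrative): with in_tokens=False,
+        # batch_size counts features, so batch_size=8 always groups 8
+        # features. With in_tokens=True it counts tokens: for batch_size=512
+        # and a longest-so-far sequence of 128 tokens, a fifth feature would
+        # need (4 + 1) * 128 = 640 > 512 padded tokens, so the batch is
+        # yielded with 4 features.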
+        def batch_reader(features, batch_size, in_tokens):
+            batch, total_token_num, max_len = [], 0, 0
+            for (index, feature) in enumerate(features):
+                if phase == 'train':
+                    self.current_train_example = index + 1
+                seq_len = len(feature.input_ids)
+                labels = [feature.unique_id
+                          ] if feature.start_position is None else [
+                              feature.start_position, feature.end_position
+                          ]
+                example = [
+                    feature.input_ids, feature.segment_ids,
+                    range(seq_len)
+                ] + labels
+                max_len = max(max_len, seq_len)
+
+                if in_tokens:
+                    to_append = (len(batch) + 1) * max_len <= batch_size
+                else:
+                    to_append = len(batch) < batch_size
+
+                if to_append:
+                    batch.append(example)
+                    total_token_num += seq_len
+                else:
+                    yield batch, total_token_num
+                    batch, total_token_num, max_len = [example
+                                                       ], seq_len, seq_len
+            if len(batch) > 0:
+                yield batch, total_token_num
+
+        def wrapper():
+            if shuffle:
+                np.random.shuffle(examples)
+            if phase == "train":
+                features = self.convert_examples_to_features(
+                    examples, is_training=True)
+            else:
+                features = self.convert_examples_to_features(
+                    examples, is_training=False)
+
+            for batch_data, total_token_num in batch_reader(
+                    features, batch_size, self._in_tokens):
+                batch_data = prepare_batch_data(
+                    batch_data,
+                    total_token_num,
+                    self._max_seq_length,
+                    pad_id=self.pad_id,
+                    cls_id=self.cls_id,
+                    sep_id=self.sep_id,
+                    return_input_mask=True,
+                    return_max_len=False,
+                    return_num_token=False)
+
+                yield [batch_data]
+
+        return wrapper
+
+    def convert_examples_to_features(self, examples, is_training):
+        """Converts `SquadExample`s into `SquadInputFeatures`s."""
+
+        unique_id = 1000000000
+
+        for (example_index, example) in enumerate(examples):
+            query_tokens = self._tokenizer.tokenize(example.question_text)
+
+            if len(query_tokens) > self._max_query_length:
+                query_tokens = query_tokens[0:self._max_query_length]
+
+            tok_to_orig_index = []
+            orig_to_tok_index = []
+            all_doc_tokens = []
+            for (i, token) in enumerate(example.doc_tokens):
+                orig_to_tok_index.append(len(all_doc_tokens))
+                sub_tokens = self._tokenizer.tokenize(token)
+                for sub_token in sub_tokens:
+                    tok_to_orig_index.append(i)
+                    all_doc_tokens.append(sub_token)
+
+            tok_start_position = None
+            tok_end_position = None
+            if is_training and example.is_impossible:
+                tok_start_position = -1
+                tok_end_position = -1
+            if is_training and not example.is_impossible:
+                tok_start_position = orig_to_tok_index[example.start_position]
+                if example.end_position < len(example.doc_tokens) - 1:
+                    tok_end_position = orig_to_tok_index[example.end_position +
+                                                         1] - 1
+                else:
+                    tok_end_position = len(all_doc_tokens) - 1
+                (tok_start_position,
+                 tok_end_position) = self.improve_answer_span(
+                     all_doc_tokens, tok_start_position, tok_end_position,
+                     self._tokenizer, example.orig_answer_text)
+
+            # The -3 accounts for [CLS], [SEP] and [SEP]
+            max_tokens_for_doc = self._max_seq_length - len(query_tokens) - 3
+
+            # We can have documents that are longer than the maximum sequence length.
+            # To deal with this we do a sliding window approach, where we take chunks
+            # of up to our max length with a stride of `doc_stride`.
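+            # Worked example: for 800 doc tokens, max_tokens_for_doc=384 and
+            # doc_stride=128, the loop below emits spans starting at offsets
+            # 0, 128, 256, 384 and 512 (the last one 288 tokens long), so
+            # each token is covered by several overlapping windows.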
+            _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
+            doc_spans = []
+            start_offset = 0
+            while start_offset < len(all_doc_tokens):
+                length = len(all_doc_tokens) - start_offset
+                if length > max_tokens_for_doc:
+                    length = max_tokens_for_doc
+                doc_spans.append(_DocSpan(start=start_offset, length=length))
+                if start_offset + length == len(all_doc_tokens):
+                    break
+                start_offset += min(length, self._doc_stride)
+
+            for (doc_span_index, doc_span) in enumerate(doc_spans):
+                tokens = []
+                token_to_orig_map = {}
+                token_is_max_context = {}
+                segment_ids = []
+                tokens.append("[CLS]")
+                segment_ids.append(0)
+                for token in query_tokens:
+                    tokens.append(token)
+                    segment_ids.append(0)
+                tokens.append("[SEP]")
+                segment_ids.append(0)
+
+                for i in range(doc_span.length):
+                    split_token_index = doc_span.start + i
+                    token_to_orig_map[len(
+                        tokens)] = tok_to_orig_index[split_token_index]
+
+                    is_max_context = self.check_is_max_context(
+                        doc_spans, doc_span_index, split_token_index)
+                    token_is_max_context[len(tokens)] = is_max_context
+                    tokens.append(all_doc_tokens[split_token_index])
+                    segment_ids.append(1)
+                tokens.append("[SEP]")
+                segment_ids.append(1)
+
+                input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
+
+                # The mask has 1 for real tokens and 0 for padding tokens. Only real
+                # tokens are attended to.
+                input_mask = [1] * len(input_ids)
+
+                # Zero-padding up to max_seq_length is deferred to
+                # prepare_batch_data in the wrapper above, so the features
+                # keep their unpadded lengths here.
+
+                start_position = None
+                end_position = None
+                if is_training and not example.is_impossible:
+                    # For training, if our document chunk does not contain an annotation
+                    # we throw it out, since there is nothing to predict.
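+                    # Offset arithmetic (illustrative): with 20 query tokens
+                    # the document starts at doc_offset = 20 + 2 = 22 inside
+                    # `tokens` ([CLS] + query + [SEP]), so an answer at doc
+                    # token positions 130-132 in a span starting at 128 maps
+                    # to 130 - 128 + 22 = 24 through 132 - 128 + 22 = 26.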
+                    doc_start = doc_span.start
+                    doc_end = doc_span.start + doc_span.length - 1
+                    out_of_span = False
+                    if not (tok_start_position >= doc_start
+                            and tok_end_position <= doc_end):
+                        out_of_span = True
+                    if out_of_span:
+                        start_position = 0
+                        end_position = 0
+                    else:
+                        doc_offset = len(query_tokens) + 2
+                        start_position = tok_start_position - doc_start + doc_offset
+                        end_position = tok_end_position - doc_start + doc_offset
+
+                if is_training and example.is_impossible:
+                    start_position = 0
+                    end_position = 0
+
+                if example_index < 3:
+                    logger.debug("*** Example ***")
+                    logger.debug("unique_id: %s" % (unique_id))
+                    logger.debug("example_index: %s" % (example_index))
+                    logger.debug("doc_span_index: %s" % (doc_span_index))
+                    logger.debug("tokens: %s" % " ".join(
+                        [tokenization.printable_text(x) for x in tokens]))
+                    logger.debug("token_to_orig_map: %s" % " ".join([
+                        "%d:%d" % (x, y)
+                        for (x, y) in six.iteritems(token_to_orig_map)
+                    ]))
+                    logger.debug("token_is_max_context: %s" % " ".join([
+                        "%d:%s" % (x, y)
+                        for (x, y) in six.iteritems(token_is_max_context)
+                    ]))
+                    logger.debug(
+                        "input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                    logger.debug("input_mask: %s" % " ".join(
+                        [str(x) for x in input_mask]))
+                    logger.debug("segment_ids: %s" % " ".join(
+                        [str(x) for x in segment_ids]))
+                    if is_training and example.is_impossible:
+                        logger.debug("impossible example")
+                    if is_training and not example.is_impossible:
+                        answer_text = " ".join(
+                            tokens[start_position:(end_position + 1)])
+                        logger.debug("start_position: %d" % (start_position))
+                        logger.debug("end_position: %d" % (end_position))
+                        logger.debug("answer: %s" %
+                                     (tokenization.printable_text(answer_text)))
+
+                feature = SquadInputFeatures(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    segment_ids=segment_ids,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=example.is_impossible)
+
+                unique_id += 1
+
+                yield feature
+
+    def improve_answer_span(self, doc_tokens, input_start, input_end,
+                            tokenizer, orig_answer_text):
+        """Returns tokenized answer spans that better match the annotated answer."""
+
+        # The SQuAD annotations are character based. We first project them to
+        # whitespace-tokenized words. But then after WordPiece tokenization, we can
+        # often find a "better match". For example:
+        #
+        # Question: What year was John Smith born?
+        # Context: The leader was John Smith (1895-1943).
+        # Answer: 1895
+        #
+        # The original whitespace-tokenized answer will be "(1895-1943).". However
+        # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+        # the exact answer, 1895.
+        #
+        # However, this is not always possible. Consider the following:
+        #
+        # Question: What country is the top exporter of electronics?
+        # Context: The Japanese electronics industry is the largest in the world.
+        # Answer: Japan
+        #
+        # In this case, the annotator chose "Japan" as a character sub-span of
+        # the word "Japanese". Since our WordPiece tokenizer does not split
+        # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+        # in SQuAD, but does happen.
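+        # Minimal trace of the search below: for the first example,
+        # doc_tokens = ["(", "1895", "-", "1943", ")", "."] with projected
+        # span (0, 5); the sub-span (1, 1) joins to "1895", which equals the
+        # tokenized answer, so (1, 1) is returned in place of (0, 5).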
+        tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+        for new_start in range(input_start, input_end + 1):
+            for new_end in range(input_end, new_start - 1, -1):
+                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+                if text_span == tok_answer_text:
+                    return (new_start, new_end)
+
+        return (input_start, input_end)
+
+    def check_is_max_context(self, doc_spans, cur_span_index, position):
+        """Check if this is the 'max context' doc span for the token."""
+
+        # Because of the sliding window approach taken to scoring documents, a single
+        # token can appear in multiple documents. E.g.
+        #  Doc: the man went to the store and bought a gallon of milk
+        #  Span A: the man went to the
+        #  Span B: to the store and bought
+        #  Span C: and bought a gallon of
+        #  ...
+        #
+        # Now the word 'bought' will have two scores from spans B and C. We only
+        # want to consider the score with "maximum context", which we define as
+        # the *minimum* of its left and right context (the *sum* of left and
+        # right context will always be the same, of course).
+        #
+        # In the example the maximum context for 'bought' would be span C since
+        # it has 1 left context and 3 right context, while span B has 4 left context
+        # and 0 right context.
+        best_score = None
+        best_span_index = None
+        for (span_index, doc_span) in enumerate(doc_spans):
+            end = doc_span.start + doc_span.length - 1
+            if position < doc_span.start:
+                continue
+            if position > end:
+                continue
+            num_left_context = position - doc_span.start
+            num_right_context = end - position
+            score = min(num_left_context,
+                        num_right_context) + 0.01 * doc_span.length
+            if best_score is None or score > best_score:
+                best_score = score
+                best_span_index = span_index
+
+        return cur_span_index == best_span_index
+
+
 if __name__ == '__main__':
     pass
diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py
index 80c1856b6d21615897109a2b6ec38d4bb5d173df..c8e2bf2067ab7ca9537b03b303ad85f4c2753744 100644
--- a/paddlehub/reader/tokenization.py
+++ b/paddlehub/reader/tokenization.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import collections
 import io
 import unicodedata
-
 import six
diff --git a/requirements.txt b/requirements.txt
index a7444b12a3a5eef5dfd9b580575eab5adf6b399a..5bf659413d84d7e05fa2342549735d44087dbadd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-visualdl >= 1.3.0
 pre-commit
 protobuf >= 3.1.0
 yapf == 0.26.0
@@ -12,3 +11,5 @@ requests
 pandas
 #[py2]pandas == 0.24.0
 flake8
+tb-paddle
+cma == 2.7.0
diff --git a/setup.py b/setup.py
index e82082d860f1473d200cc4c228209e5e9ea5a20f..92856ba60707ee74f56a8681f79d04e94a53942d 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ max_version, mid_version, min_version = python_version()
 
 REQUIRED_PACKAGES = [
     'six >= 1.10.0', 'protobuf >= 3.1.0', 'pyyaml', 'Pillow', 'requests',
-    "visualdl >= 1.3.0"
+    'visualdl >= 1.3.0', 'cma == 2.7.0'
 ]
 
 if max_version < 3: