Commit 271883bf authored by K kinghuin, committed by wuzewu

support ChineseGLUE (#217)

* machine reading comprehension
Parent 8419f9d5
......@@ -113,3 +113,8 @@ dmypy.json
# Pyre type checker
.pyre/
# pycharm
.DS_Store
.idea/
FETCH_HEAD
\ No newline at end of file
......@@ -33,8 +33,7 @@ import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
import evaluate_v1
import evaluate_v2
from paddlehub.finetune.task.reading_comprehension_task import write_predictions
hub.common.logger.logger.setLevel("INFO")
......@@ -54,354 +53,36 @@ parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True,
parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.")
parser.add_argument("--n_best_size", type=int, default=20, help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
parser.add_argument("--dataset", type=str, default="squad", help="Support squad, squad2.0, drcd and cmrc2018")
args = parser.parse_args()
# yapf: enable.
def write_predictions(
all_examples,
all_features,
all_results,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
n_best_size=20,
max_answer_length=30,
do_lower_case=True,
version_2_with_negative=False,
null_score_diff_threshold=0.0,
):
"""Write final predictions to the json file and log-odds of null if needed."""
print("Writing predictions to: %s" % (output_prediction_file))
print("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = get_best_indexes(result.start_logits, n_best_size)
end_indexes = get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
        # if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
        # Guard: in rare cases every n-best entry may be the null answer
        if best_non_null_entry is None:
            print("Warning: no best non-null entry was found")
probs = compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
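    # For example (illustrative, not part of the original script):
    # _strip_spaces("a b") returns ("ab", {0: 0, 1: 2}), i.e. the text without
    # spaces plus a map from stripped positions back to original positions.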
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = hub.reader.tokenization.BasicTokenizer(
do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
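# Note (sketch, standard library only): the same indexes could be obtained with
# heapq.nlargest(n_best_size, range(len(logits)), key=lambda i: logits[i]).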
def compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
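# Illustrative behaviour: compute_softmax([1.0, 2.0, 3.0]) returns probabilities
# summing to 1.0 with the largest mass on the last score; subtracting max_score
# before exponentiating keeps math.exp from overflowing on large logits.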
if __name__ == '__main__':
# Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model
# Download dataset and use ReadingComprehensionReader to read dataset
if args.dataset == "squad":
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset == "squad2.0" or args.dataset == "squad2":
args.dataset = "squad2.0"
dataset = hub.dataset.SQUAD(version_2_with_negative=True)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
# module = hub.Module(module_dir=["./bert_uncased_L-12_H-768_A-12.hub_module"])
elif args.dataset == "drcd":
dataset = hub.dataset.DRCD()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
elif args.dataset == "cmrc2018":
dataset = hub.dataset.CMRC2018()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
else:
raise Exception(
"Only support datasets: squad, squad2.0, drcd and cmrc2018")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset
dataset = hub.dataset.SQUAD(
version_2_with_negative=args.version_2_with_negative)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_length=args.max_seq_len,
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
......@@ -444,82 +125,5 @@ if __name__ == '__main__':
config=config)
# Data to be predicted
data = dataset.predict_examples
features = reader.convert_examples_to_features(
examples=data, is_training=False)
run_states = reading_comprehension_task.predict(data=data)
results = [run_state.run_results for run_state in run_states]
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
all_results = []
for batch_idx, batch_result in enumerate(results):
np_unique_ids = batch_result[0]
np_start_logits = batch_result[1]
np_end_logits = batch_result[2]
np_num_seqs = batch_result[3]
for idx in range(np_unique_ids.shape[0]):
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_prediction_file = os.path.join(args.result_dir, "predictions.json")
output_nbest_file = os.path.join(args.result_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.result_dir, "null_odds.json")
write_predictions(
data,
features,
all_results,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
max_answer_length=args.max_answer_length,
n_best_size=args.n_best_size,
version_2_with_negative=args.version_2_with_negative,
null_score_diff_threshold=args.null_score_diff_threshold)
with io.open(dataset.predict_file, 'r', encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
with io.open(
output_prediction_file, 'r', encoding="utf8") as prediction_file:
predictions = json.load(prediction_file)
if not args.version_2_with_negative:
print(json.dumps(evaluate_v1.evaluate(dataset, predictions)))
else:
with io.open(
output_null_log_odds_file, 'r', encoding="utf8") as odds_file:
na_probs = json.load(odds_file)
# Maps qid to true/false
qid_to_has_ans = evaluate_v2.make_qid_to_has_ans(dataset)
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = evaluate_v2.get_raw_scores(dataset, predictions)
exact_thresh = evaluate_v2.apply_no_ans_threshold(
exact_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
f1_thresh = evaluate_v2.apply_no_ans_threshold(
f1_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
out_eval = evaluate_v2.make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
has_ans_eval = evaluate_v2.make_eval_dict(
exact_thresh, f1_thresh, qid_list=has_ans_qids)
evaluate_v2.merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
no_ans_eval = evaluate_v2.make_eval_dict(
exact_thresh, f1_thresh, qid_list=no_ans_qids)
evaluate_v2.merge_eval(out_eval, no_ans_eval, 'NoAns')
evaluate_v2.find_all_best_thresh(out_eval, predictions, exact_raw,
f1_raw, na_probs, qid_to_has_ans)
print(json.dumps(out_eval, indent=4))
data = dataset.dev_examples[97:98]
reading_comprehension_task.predict(data=data)
......@@ -31,28 +31,42 @@ parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight dec
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.")
parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument("--n_best_size", type=int, default=20,help="The total number of n-best predictions to generate in the ""nbest_predictions.json output file.")
parser.add_argument("--max_answer_length", type=int, default=30,help="The maximum length of an answer that can be generated. This is needed ""because the start and end predictions are not conditioned on one another.")
parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.")
parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
parser.add_argument("--dataset", type=str, default="squad", help="Support squad, squad2.0, drcd and cmrc2018")
args = parser.parse_args()
# yapf: enable.
if __name__ == '__main__':
# Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model
# Download dataset and use ReadingComprehensionReader to read dataset
if args.dataset == "squad":
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset == "squad2.0" or args.dataset == "squad2":
args.dataset = "squad2.0"
dataset = hub.dataset.SQUAD(version_2_with_negative=True)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset == "drcd":
dataset = hub.dataset.DRCD()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
elif args.dataset == "cmrc2018":
dataset = hub.dataset.CMRC2018()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
else:
raise Exception(
"Only support datasets: squad, squad2.0, drcd and cmrc2018")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset
dataset = hub.dataset.SQUAD(
version_2_with_negative=args.version_2_with_negative)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_length=args.max_seq_len,
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
......@@ -76,9 +90,10 @@ if __name__ == '__main__':
    # Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(
log_interval=10,
eval_interval=300,
save_ckpt_interval=10000,
use_pyreader=args.use_pyreader,
use_data_parallel=args.use_data_parallel,
save_ckpt_interval=1000,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
......@@ -91,7 +106,9 @@ if __name__ == '__main__':
data_reader=reader,
feature=seq_output,
feed_list=feed_list,
config=config)
config=config,
sub_task=args.dataset,
)
# Finetune by PaddleHub's API
reading_comprehension_task.finetune()
reading_comprehension_task.finetune_and_eval()
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0,1
# Recommended hyperparameters for different tasks
# squad: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# squad2.0: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# cmrc2018: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5
# drcd: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5
dataset=cmrc2018
python -u reading_comprehension.py \
--batch_size=12 \
--batch_size=8 \
--use_gpu=True \
--checkpoint_dir="./ckpt_rc" \
--learning_rate=3e-5 \
--checkpoint_dir=./ckpt_${dataset} \
--learning_rate=2.5e-5 \
--weight_decay=0.01 \
--warmup_proportion=0.1 \
--num_epoch=2 \
--max_seq_len=384 \
--use_pyreader=True \
--use_data_parallel=True \
--version_2_with_negative=False
--max_seq_len=512 \
--dataset=${dataset}
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_rc"
RES_DIR="./result"
mkdir $RES_DIR
CKPT_DIR="./ckpt_cmrc2018"
dataset=cmrc2018
python -u predict.py \
--batch_size=12 \
--batch_size=8 \
--use_gpu=True \
--dataset=${dataset} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=3e-5 \
--learning_rate=2.5e-5 \
--weight_decay=0.01 \
--warmup_proportion=0.1 \
--num_epoch=1 \
--max_seq_len=384 \
--max_seq_len=512 \
--use_pyreader=False \
--use_data_parallel=False \
--version_2_with_negative=False \
--result_dir=${RES_DIR}
--use_data_parallel=False
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0
CKPT_DIR="./ckpt_sequence_label"
python -u sequence_label.py \
......
......@@ -37,7 +37,7 @@ args = parser.parse_args()
if __name__ == '__main__':
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
......@@ -69,6 +69,9 @@ if __name__ == '__main__':
    # Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(
log_interval=10,
eval_interval=300,
save_ckpt_interval=10000,
use_data_parallel=args.use_data_parallel,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu,
......
......@@ -4,19 +4,24 @@
Classification tasks fall into two broad categories (see the loading sketch after the list):
* **Single-sentence classification**
- ChnSentiCorp
- ChineseGLUE-IFLYTEK
- ChineseGLUE-THUCNEWS
- GLUE-Cola
- GLUE-SST2
- ChnSentiCorp
* **Sentence-pair classification**
- LCQMC
- NLPCC-DBQA
- ChineseGLUE-LCQMC
- ChineseGLUE-INEWS
- ChineseGLUE-TNEWS
- ChineseGLUE-BQ
- ChineseGLUE-XNLI_zh
- GLUE-MNLI
- GLUE-QQP
- GLUE-QNLI
- GLUE-STS-B
- GLUE-MRPC
- GLUE-RTE
- NLPCC-DBQA
- XNLI
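Any of these datasets can be loaded directly through `hub.dataset`. A minimal sketch (`INews` stands in for any of the dataset classes added in this PR; the printed fields follow `InputExample`):

```python
import paddlehub as hub

# Download (on first use) and load one of the new ChineseGLUE datasets
dataset = hub.dataset.INews()
print(dataset.num_labels)  # INEWS has 3 label classes ("0", "1", "2")
for example in dataset.get_dev_examples()[:3]:
    print(example.guid, example.text_a, example.text_b, example.label)
```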
## How to Start Finetune
......
......@@ -5,11 +5,36 @@ export CUDA_VISIBLE_DEVICES=0
DATASET="chnsenticorp"
CKPT_DIR="./ckpt_${DATASET}"
python -u text_classifier.py \
--batch_size=24 \
--use_gpu=True \
--dataset=${DATASET} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=5e-5 \
--weight_decay=0.01 \
--max_seq_len=128 \
--num_epoch=3 \
--use_pyreader=True \
--use_data_parallel=True \
--use_taskid=False
# Recommended hyperparameters for different tasks
# for ChineseGLUE:
# TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# XNLI_zh: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
# INEWS: batch_size=4, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# DRCD: see demo: reading-comprehension
# CMRC2018: see demo: reading-comprehension
# BQ: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=100, lr=1e-5
# MSRANER: see demo: sequence-labeling
# THUCNEWS: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=5e-5
# IFLYTEKDATA: batch_size=16, weight_decay=0, num_epoch=5, max_seq_len=256, lr=1e-5
# for other tasks:
# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5
# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5
# TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# QQP: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
......@@ -22,23 +47,10 @@ CKPT_DIR="./ckpt_${DATASET}"
# mnli_mm: dev and test in mismatched dataset.
# The difference can be seen in https://www.nyu.edu/projects/bowman/multinli/paper.pdf.
# If you are not sure which one to pick, just use mnli or mnli_m.
# XNLI: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
# XNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# Specify the language with an underscore like xnli_zh.
# ar- Arabic bg- Bulgarian de- German
# el- Greek en- English es- Spanish
# fr- French hi- Hindi ru- Russian
# sw- Swahili th- Thai tr- Turkish
# ur- Urdu vi- Vietnamese zh- Chinese (Simplified)
python -u text_classifier.py \
--batch_size=24 \
--use_gpu=True \
--dataset=${DATASET} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=5e-5 \
--weight_decay=0.01 \
--max_seq_len=128 \
--num_epoch=3 \
--use_pyreader=True \
--use_data_parallel=True \
--use_taskid=False \
......@@ -43,20 +43,36 @@ if __name__ == '__main__':
# Download dataset and use ClassifyReader to read dataset
if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == "tnews":
dataset = hub.dataset.TNews()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == "nlpcc_dbqa":
dataset = hub.dataset.NLPCC_DBQA()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == "lcqmc":
dataset = hub.dataset.LCQMC()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == 'inews':
dataset = hub.dataset.INews()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'bq':
dataset = hub.dataset.BQ()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'thucnews':
dataset = hub.dataset.THUCNEWS()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'iflytek':
dataset = hub.dataset.IFLYTEK()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == "mrpc":
dataset = hub.dataset.GLUE("MRPC")
if args.use_taskid:
......@@ -116,7 +132,7 @@ if __name__ == '__main__':
metrics_choices = ["acc"]
elif args.dataset.lower().startswith("xnli"):
dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:])
module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
else:
raise ValueError("%s dataset is not defined" % args.dataset)
......@@ -140,7 +156,7 @@ if __name__ == '__main__':
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
    # Must feed all the tensors ERNIE's module needs
    # Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
......
......@@ -24,6 +24,12 @@ from .squad import SQUAD
from .xnli import XNLI
from .glue import GLUE
from .tnews import TNews
from .inews import INews
from .drcd import DRCD
from .cmrc2018 import CMRC2018
from .bq import BQ
from .iflytek import IFLYTEK
from .thucnews import THUCNEWS
# CV Dataset
from .dogcat import DogCatDataset as DogCat
......
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/bq.tar.gz"
class BQ(HubDataset):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "bq")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["0", "1"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
data = line.strip().split("\t")
example = InputExample(
guid=i, label=data[2], text_a=data[0], text_b=data[1])
examples.append(example)
return examples
if __name__ == "__main__":
ds = BQ()
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on cmrc2018"""
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
class CMRC2018Example(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (tokenization.printable_text(
self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position is not None:
s += ", orig_answer_text: %s" % (self.orig_answer_text)
s += ", start_position: %d" % (self.start_position)
s += ", end_position: %d" % (self.end_position)
return s
class CMRC2018(object):
"""A single set of features of data."""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_dev_examples()
self._load_test_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "cmrc2018_train.json")
self.train_examples = self._read_json(self.train_file, is_training=True)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "cmrc2018_dev.json")
self.dev_examples = self._read_json(self.dev_file, is_training=False)
def _load_test_examples(self):
pass
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return []
def _read_json(self, input_file, is_training=False):
"""Read a cmrc2018 json file into a list of CRCDExample."""
def _is_chinese_char(cp):
if ((cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True
return False
def _is_punctuation(c):
if c in [
'。', ',', '!', '?', ';', '、', ':', '(', ')', '-', '~', '「',
'《', '》', ',', '」', '"', '“', '”', '$', '『', '』', '—', ';',
'。', '(', ')', '-', '~', '。', '‘', '’', '─', ':'
]:
return True
return False
def _tokenize_chinese_chars(text):
"""Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
characters, we add spaces around every character in the CJK Unicode range before
applying WordPiece. This means that Chinese is effectively character-tokenized.
Note that the CJK Unicode block only includes Chinese-origin characters and
does not include Hangul Korean or Katakana/Hiragana Japanese, which are tokenized
with whitespace+WordPiece like all other languages."""
output = []
for char in text:
cp = ord(char)
if _is_chinese_char(cp) or _is_punctuation(char):
if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
output.append(SPIECE_UNDERLINE)
output.append(char)
output.append(SPIECE_UNDERLINE)
else:
output.append(char)
return "".join(output)
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
c) == 0x202F or ord(c) == 0x3000 or c == SPIECE_UNDERLINE:
return True
return False
examples = []
drop = 0
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
context = _tokenize_chinese_chars(paragraph_text)
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in context:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
if c != SPIECE_UNDERLINE:
char_to_word_offset.append(len(doc_tokens) - 1)
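                    # char_to_word_offset now maps every character of the
                    # original paragraph_text to its doc_tokens index; the
                    # answer character offsets below become token positions.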
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
# Only select the first answer
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
while paragraph_text[answer_offset] in [
" ", "\t", "\r", "\n", "。", ",", ":", ":", ".", ","
]:
answer_offset += 1
start_position = char_to_word_offset[answer_offset]
answer_length = len(orig_answer_text)
end_offset = answer_offset + answer_length - 1
if end_offset >= len(char_to_word_offset):
end_offset = len(char_to_word_offset) - 1
end_position = char_to_word_offset[end_offset]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
if is_training:
actual_text = "".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
drop += 1
# logger.warning((actual_text, " vs ",
# cleaned_answer_text, " in ", qa))
continue
example = CMRC2018Example(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
logger.warning("%i bad examples has been dropped" % drop)
return examples
if __name__ == "__main__":
print("begin")
ds = CMRC2018()
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
print("dev")
examples = ds.get_dev_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on DRCD"""
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
class DRCDExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (tokenization.printable_text(
self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position is not None:
s += ", orig_answer_text: %s" % (self.orig_answer_text)
s += ", start_position: %d" % (self.start_position)
s += ", end_position: %d" % (self.end_position)
return s
class DRCD(object):
"""A single set of features of data."""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "drcd")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_dev_examples()
self._load_test_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "DRCD_training.json")
self.train_examples = self._read_json(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "DRCD_dev.json")
self.dev_examples = self._read_json(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "DRCD_test.json")
self.test_examples = self._read_json(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def _read_json(self, input_file):
"""Read a DRCD json file into a list of CRCDExample."""
def _is_chinese_char(cp):
if ((cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF)
or (cp >= 0x20000 and cp <= 0x2A6DF)
or (cp >= 0x2A700 and cp <= 0x2B73F)
or (cp >= 0x2B740 and cp <= 0x2B81F)
or (cp >= 0x2B820 and cp <= 0x2CEAF)
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F)):
return True
return False
def _is_punctuation(c):
if c in [
'。', ',', '!', '?', ';', '、', ':', '(', ')', '-', '~', '「',
'《', '》', ',', '」', '"', '“', '”', '$', '『', '』', '—', ';',
'。', '(', ')', '-', '~', '。', '‘', '’', '─', ':'
]:
return True
return False
def _tokenize_chinese_chars(text):
"""Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
characters, we add spaces around every character in the CJK Unicode range before
applying WordPiece. This means that Chinese is effectively character-tokenized.
Note that the CJK Unicode block only includes Chinese-origin characters and
does not include Hangul Korean or Katakana/Hiragana Japanese, which are tokenized
with whitespace+WordPiece like all other languages."""
output = []
for char in text:
cp = ord(char)
if _is_chinese_char(cp) or _is_punctuation(char):
if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
output.append(SPIECE_UNDERLINE)
output.append(char)
output.append(SPIECE_UNDERLINE)
else:
output.append(char)
return "".join(output)
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
c) == 0x202F or ord(c) == 0x3000 or c == SPIECE_UNDERLINE:
return True
return False
examples = []
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
context = _tokenize_chinese_chars(paragraph_text)
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in context:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
if c != SPIECE_UNDERLINE:
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
# Only select the first answer
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
while paragraph_text[answer_offset] in [
" ", "\t", "\r", "\n", "。", ",", ":", ":", ".", ","
]:
answer_offset += 1
start_position = char_to_word_offset[answer_offset]
answer_length = len(orig_answer_text)
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = "".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning((actual_text, " vs ",
cleaned_answer_text, " in ", qa))
continue
example = DRCDExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
if __name__ == "__main__":
ds = DRCD()
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
print("dev")
examples = ds.get_dev_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
print("test")
examples = ds.get_test_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(HubDataset):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "iflytek")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [str(i) for i in range(119)]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
data = line.strip().split("_!_")
                try:
                    example = InputExample(
                        guid=i, label=str(data[0]), text_a=data[1], text_b=None)
                    examples.append(example)
                except IndexError:
                    # skip malformed lines that lack a label or text field
                    pass
return examples
if __name__ == "__main__":
ds = IFLYTEK()
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(HubDataset):
"""
    INews is a sentiment analysis dataset for Internet news.
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "inews")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["0", "1", "2"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
if i == 0:
continue
data = line.strip().split("_!_")
example = InputExample(
guid=i, label=data[0], text_a=data[2], text_b=data[3])
examples.append(example)
return examples
if __name__ == "__main__":
ds = INews()
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -76,42 +76,50 @@ class SQUAD(object):
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples(version_2_with_negative, is_training=True)
self._load_predict_examples(version_2_with_negative, is_training=False)
self.version_2_with_negative = version_2_with_negative
self._load_train_examples(version_2_with_negative, if_has_answer=True)
self._load_dev_examples(version_2_with_negative, if_has_answer=True)
def _load_train_examples(self,
version_2_with_negative=False,
is_training=True):
if_has_answer=True):
if not version_2_with_negative:
self.train_file = os.path.join(self.dataset_dir, "train-v1.1.json")
else:
self.train_file = os.path.join(self.dataset_dir, "train-v2.0.json")
self.train_examples = self._read_json(self.train_file, is_training,
self.train_examples = self._read_json(self.train_file, if_has_answer,
version_2_with_negative)
def _load_predict_examples(self,
def _load_dev_examples(self,
version_2_with_negative=False,
is_training=False):
if_has_answer=True):
if not version_2_with_negative:
self.predict_file = os.path.join(self.dataset_dir, "dev-v1.1.json")
self.dev_file = os.path.join(self.dataset_dir, "dev-v1.1.json")
else:
self.predict_file = os.path.join(self.dataset_dir, "dev-v2.0.json")
self.dev_file = os.path.join(self.dataset_dir, "dev-v2.0.json")
self.predict_examples = self._read_json(self.predict_file, is_training,
self.dev_examples = self._read_json(self.dev_file, if_has_answer,
version_2_with_negative)
def _load_test_examples(self,
version_2_with_negative=False,
is_training=False):
        self.test_file = None
        logger.error("SQuAD does not provide a test file")
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return []
return self.dev_examples
def get_test_examples(self):
return []
def _read_json(self, input_file, is_training,
def _read_json(self,
input_file,
if_has_answer,
version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
......@@ -148,14 +156,13 @@ class SQUAD(object):
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if if_has_answer:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
......@@ -177,8 +184,8 @@ class SQUAD(object):
orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning(
"Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
"Could not find answer: '%s' vs. '%s'" %
(actual_text, cleaned_answer_text))
continue
else:
start_position = -1
......@@ -199,8 +206,8 @@ class SQUAD(object):
if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=True)
examples = ds.get_dev_examples()
ds = SQUAD(version_2_with_negative=False)
examples = ds.get_train_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(HubDataset):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "thucnews")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [str(i) for i in range(14)]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
data = line.strip().split("_!_")
                try:
                    example = InputExample(
                        guid=i, label=data[0], text_a=data[3], text_b=None)
                    examples.append(example)
                except IndexError:
                    # skip malformed lines that lack a label or text field
                    pass
return examples
if __name__ == "__main__":
ds = THUCNEWS()
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -32,7 +32,7 @@ _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
class TNews(HubDataset):
"""
TNews is the chinese news classification dataset on JinRiTouDiao App.
    TNews is the Chinese news classification dataset from the Jinri Toutiao App.
"""
def __init__(self):
......
......@@ -17,14 +17,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import paddle.fluid as fluid
import numpy as np
from paddlehub.common.logger import logger
import paddlehub as hub
# Sequence label evaluation functions
def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
......
# -*- coding: utf-8 -*-
'''
Evaluation script for CMRC 2018
version: v5 - special
Note:
v5 - special: Evaluate on SQuAD-style CMRC 2018 Datasets
v5: formatted output, add usage description
v4: fixed segmentation issues
'''
from __future__ import print_function
from collections import OrderedDict
import re
import json
import nltk
import sys
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
print("Downloading nltk punkt")
nltk.download('punkt')
# Segment mixed Chinese and English text
def mixed_segmentation(in_str, rm_punc=False):
in_str = str(in_str).lower().strip()
segs_out = []
temp_str = ""
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
for char in in_str:
if rm_punc and char in sp_char:
continue
if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
temp_str = ""
segs_out.append(char)
else:
temp_str += char
# handling last part
if temp_str != "":
ss = nltk.word_tokenize(temp_str)
segs_out.extend(ss)
return segs_out
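# Editor sketch -- a quick illustration of the behavior (assumes the punkt
# model fetched above is available): Chinese characters are emitted one per
# token, while contiguous English runs go through nltk.word_tokenize.
assert mixed_segmentation("BERT模型很强", rm_punc=True) == \
    ['bert', '模', '型', '很', '强']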
# remove punctuation
def remove_punctuation(in_str):
in_str = str(in_str).lower().strip()
sp_char = [
'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
'?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
')', '-', '~', '『', '』'
]
out_segs = []
for char in in_str:
if char in sp_char:
continue
else:
out_segs.append(char)
return ''.join(out_segs)
# find the longest common substring
def find_lcs(s1, s2):
m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
mmax = 0
p = 0
for i in range(len(s1)):
for j in range(len(s2)):
if s1[i] == s2[j]:
m[i + 1][j + 1] = m[i][j] + 1
if m[i + 1][j + 1] > mmax:
mmax = m[i + 1][j + 1]
p = i + 1
return s1[p - mmax:p], mmax
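# Editor note: despite the name, find_lcs computes the longest common
# *substring* (a contiguous run), not the longest common subsequence.
# A small sanity check on token lists, as calc_f1_score uses it below:
assert find_lcs(list("abcdef"), list("zcdez")) == (['c', 'd', 'e'], 3)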
def evaluate(ground_truth_file, prediction_file):
f1 = 0
em = 0
total_count = 0
skip_count = 0
for instance in ground_truth_file:
# context_id = instance['context_id'].strip()
# context_text = instance['context_text'].strip()
for para in instance["paragraphs"]:
for qas in para['qas']:
total_count += 1
query_id = qas['id'].strip()
query_text = qas['question'].strip()
answers = [x["text"] for x in qas['answers']]
if query_id not in prediction_file:
print('Unanswered question: {}\n'.format(query_id))
skip_count += 1
continue
prediction = str(prediction_file[query_id])
f1 += calc_f1_score(answers, prediction)
em += calc_em_score(answers, prediction)
f1_score = 100.0 * f1 / total_count
em_score = 100.0 * em / total_count
return f1_score, em_score, total_count, skip_count
def calc_f1_score(answers, prediction):
f1_scores = []
for ans in answers:
ans_segs = mixed_segmentation(ans, rm_punc=True)
prediction_segs = mixed_segmentation(prediction, rm_punc=True)
lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
if lcs_len == 0:
f1_scores.append(0)
continue
precision = 1.0 * lcs_len / len(prediction_segs)
recall = 1.0 * lcs_len / len(ans_segs)
f1 = (2 * precision * recall) / (precision + recall)
f1_scores.append(f1)
return max(f1_scores)
def calc_em_score(answers, prediction):
em = 0
for ans in answers:
ans_ = remove_punctuation(ans)
prediction_ = remove_punctuation(prediction)
if ans_ == prediction_:
em = 1
break
return em
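# Editor sketch -- a hand-worked example of the two metrics. With the gold
# answer 北京大学 and prediction 北京, the LCS is 北京, so precision = 2/2,
# recall = 2/4 and F1 = 2*1.0*0.5/1.5 = 2/3, while EM is 0.
assert abs(calc_f1_score(["北京大学"], "北京") - 2.0 / 3.0) < 1e-6
assert calc_em_score(["北京大学"], "北京") == 0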
def get_eval(original_file, prediction_file):
F1, EM, TOTAL, SKIP = evaluate(original_file, prediction_file)
AVG = (EM + F1) * 0.5
output_result = OrderedDict()
output_result['AVERAGE'] = AVG
output_result['F1'] = F1
output_result['EM'] = EM
output_result['TOTAL'] = TOTAL
output_result['SKIP'] = SKIP
return output_result
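# Editor sketch -- despite the parameter names, get_eval takes loaded objects
# rather than file paths: original_file is the "data" list of a SQuAD-style
# JSON, and prediction_file is a {qid: answer} dict. Made-up data:
dataset = [{
    "paragraphs": [{
        "qas": [{
            "id": "q1",
            "question": "首都是哪里?",
            "answers": [{"text": "北京"}]
        }]
    }]
}]
predictions = {"q1": "北京"}
print(get_eval(dataset, predictions))
# -> OrderedDict([('AVERAGE', 100.0), ('F1', 100.0), ('EM', 100.0),
#                 ('TOTAL', 1), ('SKIP', 0)])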
......@@ -138,6 +138,7 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs,
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
return main_eval
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
......@@ -161,3 +162,28 @@ def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
def evaluate(dataset, predictions, na_probs):
qid_to_has_ans = make_qid_to_has_ans(dataset)
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, predictions)
exact_thresh = apply_no_ans_threshold(
exact_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
f1_thresh = apply_no_ans_threshold(
f1_raw, na_probs, qid_to_has_ans, na_prob_thresh=1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
has_ans_eval = make_eval_dict(
exact_thresh, f1_thresh, qid_list=has_ans_qids)
merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
no_ans_eval = make_eval_dict(
exact_thresh, f1_thresh, qid_list=no_ans_qids)
merge_eval(out_eval, no_ans_eval, 'NoAns')
find_all_best_thresh(out_eval, predictions, exact_raw, f1_raw, na_probs,
qid_to_has_ans)
return out_eval
......@@ -414,11 +414,7 @@ class CombinedStrategy(DefaultStrategy):
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
        data_reader.data_generator(
            batch_size=config.batch_size, phase='train', shuffle=True)
        num_train_examples = data_reader.num_examples['train']
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
......
......@@ -165,6 +165,10 @@ class BasicTask(object):
def enter_phase(self, phase):
if phase not in ["train", "val", "dev", "test", "predict", "inference"]:
            raise RuntimeError("Unknown phase: %s" % phase)
if phase in ["val", "dev"]:
phase = "dev"
elif phase in ["predict", "inference"]:
phase = "predict"
self._phases.append(phase)
def exit_phase(self):
......@@ -330,7 +334,7 @@ class BasicTask(object):
def env(self):
phase = self.phase
if phase in ["val", "dev", "test"]:
phase = "val"
phase = "dev"
if not phase in self._envs:
self._envs[phase] = RunEnv()
return self._envs[phase]
......@@ -468,6 +472,7 @@ class BasicTask(object):
def _eval_end_event(self, run_states):
eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
if 'train' in self._envs:
self.tb_writer.add_scalar(
tag="Loss_{}".format(self.phase),
scalar_value=eval_loss,
......@@ -475,11 +480,11 @@ class BasicTask(object):
log_scores = ""
for metric in eval_scores:
if 'train' in self._envs:
self.tb_writer.add_scalar(
tag="{}_{}".format(metric, self.phase),
scalar_value=eval_scores[metric],
global_step=self._envs['train'].current_step)
log_scores += "%s=%.5f " % (metric, eval_scores[metric])
logger.info(
"[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
......@@ -501,6 +506,7 @@ class BasicTask(object):
"best_model")
logger.info("best model saved to %s [best %s=%.5f]" %
(model_saved_dir, main_metric, main_value))
save_result = fluid.io.save_persistables(
executor=self.exe,
dirname=model_saved_dir,
......
......@@ -18,11 +18,370 @@ from __future__ import division
from __future__ import print_function
import time
import os
import collections
import math
import six
import json
from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from .basic_task import BasicTask
from paddlehub.common.logger import logger
from paddlehub.reader import tokenization
from paddlehub.finetune.evaluator import squad1_evaluate
from paddlehub.finetune.evaluator import squad2_evaluate
from paddlehub.finetune.evaluator import cmrc2018_evaluate
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
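# Editor sketch: the same top-k indices can be obtained with heapq (the result
# may differ from the loop above only in tie-breaking order).
import heapq

def _get_best_indexes_alt(logits, n_best_size):
    """Indices of the n_best_size largest logits, best first."""
    return heapq.nlargest(
        n_best_size, range(len(logits)), key=lambda i: logits[i])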
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
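# Editor sketch: the loop above is the standard max-shifted (numerically
# stable) softmax; an equivalent NumPy formulation for reference, using the
# module-level `np` import:
def _compute_softmax_np(scores):
    if not scores:
        return []
    exps = np.exp(np.array(scores) - max(scores))  # shift by max for stability
    return (exps / exps.sum()).tolist()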
def get_final_text(pred_text, orig_text, do_lower_case, is_english):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases, in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
if is_english:
tok_text = " ".join(tokenizer.tokenize(orig_text))
else:
tok_text = "".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
        # used for debugging
# logger.info(
# "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
        # used for debugging
# logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
# orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
        # used for debugging
# logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
        # used for debugging
        # logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
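# Editor sketch -- the comment's Steve Smith example, made concrete (assuming
# tokenization.BasicTokenizer lower-cases and splits punctuation the way
# BERT's does):
example_final = get_final_text(
    "steve smith", "Steve Smith's.", do_lower_case=True, is_english=True)
# example_final == "Steve Smith": the normalized span is projected back onto
# the original casing, and the trailing "'s." is dropped.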
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
is_english):
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple("PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit", "end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
if feature.unique_id not in unique_id_to_result:
                logger.info(
                    "pyreader drops the incomplete last batch, so feature %s has no result and is skipped"
                    % feature.unique_id)
continue
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
if not prelim_predictions:
            logger.warning("no prelim_predictions for example %s" % example.qas_id)
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
if is_english:
tok_text = " ".join(tok_tokens)
else:
tok_text = "".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
if is_english:
orig_text = " ".join(orig_tokens)
else:
orig_text = "".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
is_english)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null
if best_non_null_entry:
score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
"""Write final predictions to the json file and log-odds of null if needed."""
with open(output_prediction_file, "w") as writer:
logger.info("Writing predictions to: %s" % (output_prediction_file))
writer.write(
json.dumps(all_predictions, indent=4, ensure_ascii=is_english) +
"\n")
with open(output_nbest_file, "w") as writer:
logger.info("Writing nbest to: %s" % (output_nbest_file))
writer.write(
json.dumps(all_nbest_json, indent=4, ensure_ascii=is_english) +
"\n")
if version_2_with_negative:
logger.info("Writing null_log_odds to: %s" % (output_nbest_file))
with open(output_null_log_odds_file, "w") as writer:
writer.write(
json.dumps(scores_diff_json, indent=4, ensure_ascii=is_english)
+ "\n")
class ReadingComprehensionTask(BasicTask):
......@@ -32,7 +391,11 @@ class ReadingComprehensionTask(BasicTask):
data_reader,
startup_program=None,
config=None,
                 metrics_choices=None,
                 sub_task="squad",
                 null_score_diff_threshold=0.0,
                 n_best_size=20,
                 max_answer_length=30):
main_program = feature.block.program
super(ReadingComprehensionTask, self).__init__(
......@@ -43,14 +406,23 @@ class ReadingComprehensionTask(BasicTask):
config=config,
metrics_choices=metrics_choices)
self.feature = feature
self.data_reader = data_reader
self.sub_task = sub_task.lower()
self.version_2_with_negative = (self.sub_task == "squad2.0")
if self.sub_task in ["squad2.0", "squad"]:
self.is_english = True
elif self.sub_task in ["cmrc2018", "drcd"]:
self.is_english = False
else:
raise Exception("No language type of data set is sepecified")
self.null_score_diff_threshold = null_score_diff_threshold
self.n_best_size = n_best_size
self.max_answer_length = max_answer_length
def _build_net(self):
        self.unique_ids = fluid.layers.data(
            name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
logits = fluid.layers.fc(
input=self.feature,
......@@ -100,37 +472,149 @@ class ReadingComprehensionTask(BasicTask):
@property
def feed_list(self):
        feed_list = [varname for varname in self._base_feed_list
                     ] + [self.unique_ids.name]
        if self.is_train_phase or self.is_test_phase:
            feed_list += [label.name for label in self.labels]
return feed_list
@property
def fetch_list(self):
        if self.is_train_phase or self.is_test_phase:
            return [
                self.loss.name, self.outputs[-1].name, self.unique_ids.name,
                self.outputs[0].name, self.outputs[1].name
            ]
        elif self.is_predict_phase:
            return [
                self.unique_ids.name,
            ] + [output.name for output in self.outputs]
def _calculate_metrics(self, run_states):
        total_cost, total_num_seqs, all_results = [], [], []
        run_step = run_time_used = run_examples = 0
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
for run_state in run_states:
np_loss = run_state.run_results[0]
np_num_seqs = run_state.run_results[1]
total_cost.extend(np_loss * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
run_examples += run_state.run_examples
run_step += run_state.run_step
if self.is_test_phase:
np_unique_ids = run_state.run_results[2]
np_start_logits = run_state.run_results[3]
np_end_logits = run_state.run_results[4]
for idx in range(np_unique_ids.shape[0]):
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
run_time_used = time.time() - run_states[0].run_time_begin
run_speed = run_step / run_time_used
avg_loss = np.sum(total_cost) / np.sum(total_num_seqs)
scores = OrderedDict()
        # If no metric is implemented, the loss is used for evaluation.
if self.is_test_phase:
output_prediction_file = os.path.join(self.config.checkpoint_dir,
"predictions.json")
output_nbest_file = os.path.join(self.config.checkpoint_dir,
"nbest_predictions.json")
output_null_log_odds_file = os.path.join(self.config.checkpoint_dir,
"null_odds.json")
all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
write_predictions(
all_examples=all_examples,
all_features=all_features,
all_results=all_results,
n_best_size=self.n_best_size,
max_answer_length=self.max_answer_length,
do_lower_case=True,
output_prediction_file=output_prediction_file,
output_nbest_file=output_nbest_file,
output_null_log_odds_file=output_null_log_odds_file,
version_2_with_negative=self.version_2_with_negative,
null_score_diff_threshold=self.null_score_diff_threshold,
is_english=self.is_english)
if self.phase == 'val' or self.phase == 'dev':
with open(
self.data_reader.dataset.dev_file, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
elif self.phase == 'test':
with open(
self.data_reader.dataset.test_file, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
else:
raise Exception("Error phase: %s when runing _calculate_metrics"
% self.phase)
with open(
output_prediction_file, 'r',
encoding="utf8") as prediction_file:
predictions = json.load(prediction_file)
if self.sub_task == "squad":
scores = squad1_evaluate.evaluate(dataset, predictions)
elif self.sub_task == "squad2.0":
with open(
output_null_log_odds_file, 'r',
encoding="utf8") as odds_file:
na_probs = json.load(odds_file)
scores = squad2_evaluate.evaluate(dataset, predictions,
na_probs)
elif self.sub_task in ["cmrc2018", "drcd"]:
scores = cmrc2018_evaluate.get_eval(dataset, predictions)
return scores, avg_loss, run_speed
def _predict_end_event(self, run_states):
all_results = []
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
for run_state in run_states:
np_unique_ids = run_state.run_results[0]
np_start_logits = run_state.run_results[1]
np_end_logits = run_state.run_results[2]
for idx in range(np_unique_ids.shape[0]):
unique_id = int(np_unique_ids[idx])
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
        # If no metric is implemented, the loss is used for evaluation.
output_prediction_file = os.path.join(self.config.checkpoint_dir,
"predict_predictions.json")
output_nbest_file = os.path.join(self.config.checkpoint_dir,
"predict_nbest_predictions.json")
output_null_log_odds_file = os.path.join(self.config.checkpoint_dir,
"predict_null_odds.json")
all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
write_predictions(
all_examples=all_examples,
all_features=all_features,
all_results=all_results,
n_best_size=self.n_best_size,
max_answer_length=self.max_answer_length,
do_lower_case=True,
output_prediction_file=output_prediction_file,
output_nbest_file=output_nbest_file,
output_null_log_odds_file=output_null_log_odds_file,
version_2_with_negative=self.version_2_with_negative,
null_score_diff_threshold=self.null_score_diff_threshold,
is_english=self.is_english)
logger.info("PaddleHub predict finished.")
logger.info("You can see the prediction in %s" % output_prediction_file)
......@@ -583,7 +583,7 @@ class Module(object):
if max_seq_len > MAX_SEQ_LENGTH or max_seq_len <= 0:
raise ValueError(
"max_seq_len({}) should be in the range of [1, {}]".format(
                    max_seq_len, MAX_SEQ_LENGTH))
logger.info(
"Set maximum sequence length of input tensor to {}".format(
max_seq_len))
......
......@@ -735,36 +735,6 @@ class MultiLabelClassifyReader(BaseReader):
return record
class SquadInputFeatures(object):
"""A single set of features of squad_data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
class RegressionReader(BaseReader):
def __init__(self,
vocab_path,
......@@ -909,26 +879,75 @@ class RegressionReader(BaseReader):
return wrapper
class Features(object):
"""A single set of features of squad_data."""
def __init__(
self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
token_ids,
position_ids,
text_type_ids,
start_position=None,
end_position=None,
is_impossible=None,
):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.token_ids = token_ids
self.position_ids = position_ids
self.text_type_ids = text_type_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __repr__(self):
s = ""
s += "unique_id: %s " % self.unique_id
s += "example_index: %s " % self.example_index
s += "start_position: %s " % self.start_position
s += "end_position: %s " % self.end_position
s += "is_impossible: %s " % self.is_impossible
# s += "tokens: %s" % self.tokens
# s += "token_to_orig_map %s" % self.token_to_orig_map
return s
class ReadingComprehensionReader(BaseReader):
def __init__(self,
dataset,
vocab_path,
do_lower_case=True,
                 max_seq_len=512,
                 doc_stride=128,
                 max_query_length=64,
                 random_seed=None,
                 use_task_id=False):
        self.dataset = dataset
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.max_seq_len = max_seq_len
        self.doc_stride = doc_stride
        self.max_query_length = max_query_length
        self.use_task_id = use_task_id
        self.in_tokens = False
        # self.all_examples[phase] and self.all_features[phase] are used by
        # write_predictions in reading_comprehension_task
        self.all_features = {"train": [], "dev": [], "test": [], "predict": []}
        self.all_examples = {"train": [], "dev": [], "test": [], "predict": []}
        np.random.seed(random_seed)
        self.vocab = self.tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
......@@ -939,139 +958,172 @@ class ReadingComprehensionReader(object):
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
    def _pad_batch_records(self, batch_records, phase):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_unique_ids = [record.unique_id for record in batch_records]
        batch_unique_ids = np.array(batch_unique_ids).astype("int64").reshape(
            [-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            return_input_mask=True,
            max_seq_len=self.max_seq_len)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len)
        padded_position_ids = pad_batch_data(
            batch_position_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len)

        if phase != "predict":
            batch_start_position = [
                record.start_position for record in batch_records
            ]
            batch_end_position = [
                record.end_position for record in batch_records
            ]
            batch_start_position = np.array(batch_start_position).astype(
                "int64").reshape([-1, 1])
            batch_end_position = np.array(batch_end_position).astype(
                "int64").reshape([-1, 1])

            return_list = [
                padded_token_ids, padded_position_ids, padded_text_type_ids,
                input_mask, batch_unique_ids, batch_start_position,
                batch_end_position
            ]
            if self.use_task_id:
                padded_task_ids = np.ones_like(
                    padded_token_ids, dtype="int64") * self.task_id
                return_list = [
                    padded_token_ids, padded_position_ids, padded_text_type_ids,
                    input_mask, padded_task_ids, batch_unique_ids,
                    batch_start_position, batch_end_position
                ]
        else:
            return_list = [
                padded_token_ids, padded_position_ids, padded_text_type_ids,
                input_mask, batch_unique_ids
            ]
            if self.use_task_id:
                padded_task_ids = np.ones_like(
                    padded_token_ids, dtype="int64") * self.task_id
                return_list = [
                    padded_token_ids, padded_position_ids, padded_text_type_ids,
                    input_mask, padded_task_ids, batch_unique_ids
                ]

        return return_list
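    # Editor sketch -- a simplified stand-in for what _pad_batch_records relies
    # on pad_batch_data for: right-padding each id sequence to max_seq_len with
    # pad_id and building a 0/1 input mask (the real helper returns
    # Paddle-shaped arrays with an extra trailing dimension).
    def _pad_to_max_len_demo(batch_token_ids, pad_id, max_seq_len):
        padded = np.full((len(batch_token_ids), max_seq_len), pad_id, "int64")
        mask = np.zeros((len(batch_token_ids), max_seq_len), "float32")
        for row, ids in enumerate(batch_token_ids):
            padded[row, :len(ids)] = ids
            mask[row, :len(ids)] = 1.0
        return padded, mask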
def _prepare_batch_data(self, records, batch_size, phase=None):
"""generate batch records"""
batch_records, max_len = [], 0
for index, record in enumerate(records):
if phase == "train":
self.current_example = index
max_len = max(max_len, len(record.token_ids))
if self.in_tokens:
to_append = (len(batch_records) + 1) * max_len <= batch_size
else:
to_append = len(batch_records) < batch_size
if to_append:
batch_records.append(record)
else:
yield self._pad_batch_records(batch_records, phase)
batch_records, max_len = [record], len(record.token_ids)
if batch_records:
yield self._pad_batch_records(batch_records, phase)
def data_generator(self,
batch_size=1,
phase='train',
shuffle=False,
data=None):
        # We cache all_examples and all_features for write_predictions in
        # reading_comprehension_task, and to avoid repeating the slow
        # preprocessing.
        examples = None
        if self.all_examples[phase]:
            examples = self.all_examples[phase]
        else:
            if phase == 'train':
                examples = self.get_train_examples()
                self.num_examples['train'] = len(examples)
            elif phase == 'dev':
                examples = self.get_dev_examples()
                self.num_examples['dev'] = len(examples)
            elif phase == 'test':
                examples = self.get_test_examples()
                self.num_examples['test'] = len(examples)
            elif phase == 'predict':
                examples = data
            else:
                raise ValueError(
                    "Unknown phase, which should be in ['train', 'dev', 'test', 'predict']."
                )
            self.all_examples[phase] = examples
        shuffle = True if phase == 'train' else False
        # A reading comprehension task divides a long context into several
        # doc_spans, each of which becomes a feature. To get the real number
        # of steps we need the number of features, so this task uses
        # _convert_examples_to_records rather than _convert_example_to_record.
        if self.all_features[phase]:
            features = self.all_features[phase]
        else:
            features = self._convert_examples_to_records(
                examples, self.max_seq_len, self.tokenizer, phase)
            self.all_features[phase] = features
        # self.num_examples[phase] is used in strategy.py to compute the total
        # steps, so overwrite it with the correct len(features).
        self.num_examples[phase] = len(features)

        def wrapper():
            if shuffle:
                np.random.shuffle(features)
            for batch_data in self._prepare_batch_data(
                    features, batch_size, phase=phase):
                yield [batch_data]

        return wrapper
    def _convert_examples_to_records(self,
                                     examples,
                                     max_seq_length,
                                     tokenizer,
                                     phase=None):
"""Loads a data file into a list of `InputBatch`s."""
features = []
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
            query_tokens = tokenizer.tokenize(example.question_text)
            if len(query_tokens) > self.max_query_length:
                query_tokens = query_tokens[0:self.max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
                sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
            is_impossible = example.is_impossible if hasattr(
                example, "is_impossible") else False
            if phase != "predict" and is_impossible:
tok_start_position = -1
tok_end_position = -1
            if phase != "predict" and not is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
......@@ -1081,10 +1133,10 @@ class ReadingComprehensionReader(object):
(tok_start_position,
tok_end_position) = self.improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
                    tokenizer, example.orig_answer_text)
            # The -3 accounts for [CLS], [SEP] and [SEP]
            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
......@@ -1099,20 +1151,20 @@ class ReadingComprehensionReader(object):
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
                start_offset += min(length, self.doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
                text_type_ids = []
tokens.append("[CLS]")
                text_type_ids.append(0)
for token in query_tokens:
tokens.append(token)
                    text_type_ids.append(0)
tokens.append("[SEP]")
                text_type_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
......@@ -1123,29 +1175,15 @@ class ReadingComprehensionReader(object):
doc_spans, doc_span_index, split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
                    text_type_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
text_type_ids.append(1)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
position_ids = list(range(len(token_ids)))
start_position = None
end_position = None
                if phase != "predict" and not is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
......@@ -1162,58 +1200,28 @@ class ReadingComprehensionReader(object):
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
                if phase != "predict" and is_impossible:
                    start_position = 0
                    end_position = 0

                feature = Features(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    token_ids=token_ids,
                    position_ids=position_ids,
                    text_type_ids=text_type_ids,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                features.append(feature)
                unique_id += 1

        return features
def improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
......