Remove useless files and reorganize the BERT reader

44aa7b51 · Zeyu Chen · 4b35d202 · 44aa7b51 · 44aa7b51 · 4b35d202
26 changed files
--- a/README.md
+++ b/README.md
 # PaddleHub
-[![Build Status](https://travis-ci.org/PaddlePaddle/PaddleHub.svg?branch=master)](https://travis-ci.org/PaddlePaddle/PaddleHub)
+[![Build Status](https://travis-ci.org/PaddlePaddle/PaddleHub.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/PaddleHub)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
--- a/demo/bert-cls/finetune_with_hub.py
+++ b/demo/bert-cls/finetune_with_hub.py
@@ -26,56 +26,42 @@ import paddle
 import paddle.fluid as fluid
 import paddle_hub as hub
-import reader.cls as reader
-import reader.task_reader as task_reader
-from utils.args import ArgumentGroup, print_arguments
-from paddle_hub.finetune.config import FinetuneConfig
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
-train_g = ArgumentGroup(parser, "training", "training options.")
+parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
-train_g.add_arg("epoch",             int,    3,       "Number of epoches for fine-tuning.")
+parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory")
-train_g.add_arg("learning_rate",     float,  5e-5,    "Learning rate used to train with warmup.")
+parser.add_argument("--lr_scheduler", type=str, default="linear_warmup_decay",
-train_g.add_arg("hub_module_dir",    str,  None,    "PaddleHub module directory")
+        help="scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
-train_g.add_arg("lr_scheduler",      str,    "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
-train_g.add_arg("weight_decay",      float,  0.01,    "Weight decay rate for L2 regularizer.")
+parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
-train_g.add_arg("warmup_proportion", float,  0.1,
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
-                "Proportion of training steps to perform linear learning rate warmup for.")
+parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
+parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
-data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
-data_g.add_arg("data_dir",      str,  None,  "Path to training data.")
-data_g.add_arg("checkpoint_dir", str,  None,  "Directory to model checkpoint")
-data_g.add_arg("vocab_path",    str,  None,  "Vocabulary path.")
-data_g.add_arg("max_seq_len",   int,  512,   "Number of words of the longest seqence.")
-data_g.add_arg("batch_size",    int,  32,    "Total examples' number in batch for training. see also --in_tokens.")
 args = parser.parse_args()
 # yapf: enable.
 if __name__ == '__main__':
-    print_arguments(args)
+    config = hub.FinetuneConfig(
-    config = FinetuneConfig(
        log_interval=10,
        eval_interval=100,
-        save_ckpt_interval=50,
+        save_ckpt_interval=200,
-        use_cuda=True,
        checkpoint_dir=args.checkpoint_dir,
        learning_rate=args.learning_rate,
-        num_epoch=args.epoch,
+        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        max_seq_len=args.max_seq_len,
        weight_decay=args.weight_decay,
-        finetune_strategy="bert_finetune",
+        finetune_strategy="bert_finetune")
-        enable_memory_optim=True,
-        optimizer=None,
-        warmup_proportion=args.warmup_proportion)
    # loading Paddlehub BERT
    module = hub.Module(module_dir=args.hub_module_dir)
-    reader = reader.BERTClassifyReader(
+    # Use BERTTokenizeReader to tokenize the dataset according to model's
-        data_dir=args.data_dir,
+    # vocabulary
+    reader = hub.reader.BERTTokenizeReader(
+        dataset=hub.dataset.ChnSentiCorp(),  # download chnsenticorp dataset
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

--- a/demo/bert-cls/reader/__init__.py
+++ b/demo/bert-cls/reader/__init__.py
--- a/demo/bert-cls/reader/squad.py
+++ b/demo/bert-cls/reader/squad.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
-import six
-import math
-import json
-import random
-import collections
-import tokenization
-from batching import prepare_batch_data
-class SquadExample(object):
-    """A single training/test example for simple sequence classification.
-     For examples without an answer, the start and end position are -1.
-  """
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=False):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-    def __str__(self):
-        return self.__repr__()
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
-        s += ", question_text: %s" % (tokenization.printable_text(
-            self.question_text))
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.start_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.start_position:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-class InputFeatures(object):
-    """A single set of features of data."""
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-def read_squad_examples(input_file, is_training, version_2_with_negative=False):
-    """Read a SQuAD json file into a list of SquadExample."""
-    with open(input_file, "r") as reader:
-        input_data = json.load(reader)["data"]
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if version_2_with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer."
-                        )
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset +
-                                                           answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(
-                            doc_tokens[start_position:(end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            tokenization.whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            print("Could not find answer: '%s' vs. '%s'",
-                                  actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-                example = SquadExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                examples.append(example)
-    return examples
-def convert_examples_to_features(
-        examples,
-        tokenizer,
-        max_seq_length,
-        doc_stride,
-        max_query_length,
-        is_training,
-        #output_fn
-):
-    """Loads a data file into a list of `InputBatch`s."""
-    unique_id = 1000000000
-    for (example_index, example) in enumerate(examples):
-        query_tokens = tokenizer.tokenize(example.question_text)
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position +
-                                                     1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of the up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-            tokens.append("[CLS]")
-            segment_ids.append(0)
-            for token in query_tokens:
-                tokens.append(token)
-                segment_ids.append(0)
-            tokens.append("[SEP]")
-            segment_ids.append(0)
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(
-                    tokens)] = tok_to_orig_index[split_token_index]
-                is_max_context = _check_is_max_context(
-                    doc_spans, doc_span_index, split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1] * len(input_ids)
-            # Zero-pad up to the sequence length.
-            #while len(input_ids) < max_seq_length:
-            #  input_ids.append(0)
-            #  input_mask.append(0)
-            #  segment_ids.append(0)
-            #assert len(input_ids) == max_seq_length
-            #assert len(input_mask) == max_seq_length
-            #assert len(segment_ids) == max_seq_length
-            start_position = None
-            end_position = None
-            if is_training and not example.is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start
-                        and tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                else:
-                    doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-            if is_training and example.is_impossible:
-                start_position = 0
-                end_position = 0
-            if example_index < 3:
-                print("*** Example ***")
-                print("unique_id: %s" % (unique_id))
-                print("example_index: %s" % (example_index))
-                print("doc_span_index: %s" % (doc_span_index))
-                print("tokens: %s" % " ".join(
-                    [tokenization.printable_text(x) for x in tokens]))
-                print("token_to_orig_map: %s" % " ".join([
-                    "%d:%d" % (x, y)
-                    for (x, y) in six.iteritems(token_to_orig_map)
-                ]))
-                print("token_is_max_context: %s" % " ".join([
-                    "%d:%s" % (x, y)
-                    for (x, y) in six.iteritems(token_is_max_context)
-                ]))
-                print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                print(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and example.is_impossible:
-                    print("impossible example")
-                if is_training and not example.is_impossible:
-                    answer_text = " ".join(
-                        tokens[start_position:(end_position + 1)])
-                    print("start_position: %d" % (start_position))
-                    print("end_position: %d" % (end_position))
-                    print("answer: %s" %
-                          (tokenization.printable_text(answer_text)))
-            feature = InputFeatures(
-                unique_id=unique_id,
-                example_index=example_index,
-                doc_span_index=doc_span_index,
-                tokens=tokens,
-                token_to_orig_map=token_to_orig_map,
-                token_is_max_context=token_is_max_context,
-                input_ids=input_ids,
-                input_mask=input_mask,
-                segment_ids=segment_ids,
-                start_position=start_position,
-                end_position=end_position,
-                is_impossible=example.is_impossible)
-            unique_id += 1
-            yield feature
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-    """Returns tokenized answer spans that better match the annotated answer."""
-    # The SQuAD annotations are character based. We first project them to
-    # whitespace-tokenized words. But then after WordPiece tokenization, we can
-    # often find a "better match". For example:
-    #
-    #   Question: What year was John Smith born?
-    #   Context: The leader was John Smith (1895-1943).
-    #   Answer: 1895
-    #
-    # The original whitespace-tokenized answer will be "(1895-1943).". However
-    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
-    # the exact answer, 1895.
-    #
-    # However, this is not always possible. Consider the following:
-    #
-    #   Question: What country is the top exporter of electornics?
-    #   Context: The Japanese electronics industry is the lagest in the world.
-    #   Answer: Japan
-    #
-    # In this case, the annotator chose "Japan" as a character sub-span of
-    # the word "Japanese". Since our WordPiece tokenizer does not split
-    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
-    # in SQuAD, but does happen.
-    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-    return (input_start, input_end)
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-    # Because of the sliding window approach taken to scoring documents, a single
-    # token can appear in multiple documents. E.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
-    #
-    # Now the word 'bought' will have two scores from spans B and C. We only
-    # want to consider the score with "maximum context", which we define as
-    # the *minimum* of its left and right context (the *sum* of left and
-    # right context will always be the same, of course).
-    #
-    # In the example the maximum context for 'bought' would be span C since
-    # it has 1 left context and 3 right context, while span B has 4 left context
-    # and 0 right context.
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context,
-                    num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-    return cur_span_index == best_span_index
-class DataProcessor(object):
-    def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
-                 doc_stride, max_query_length):
-        self._tokenizer = tokenization.FullTokenizer(
-            vocab_file=vocab_path, do_lower_case=do_lower_case)
-        self._max_seq_length = max_seq_length
-        self._doc_stride = doc_stride
-        self._max_query_length = max_query_length
-        self._in_tokens = in_tokens
-        self.vocab = self._tokenizer.vocab
-        self.vocab_size = len(self.vocab)
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.mask_id = self.vocab["[MASK]"]
-        self.current_train_example = -1
-        self.num_train_examples = -1
-        self.current_train_epoch = -1
-        self.train_examples = None
-        self.predict_examples = None
-        self.num_examples = {'train': -1, 'predict': -1}
-    def get_train_progress(self):
-        """Gets progress for training phase."""
-        return self.current_train_example, self.current_train_epoch
-    def get_examples(self,
-                     data_path,
-                     is_training,
-                     version_2_with_negative=False):
-        examples = read_squad_examples(
-            input_file=data_path,
-            is_training=is_training,
-            version_2_with_negative=version_2_with_negative)
-        return examples
-    def get_num_examples(self, phase):
-        if phase not in ['train', 'predict']:
-            raise ValueError(
-                "Unknown phase, which should be in ['train', 'predict'].")
-        return self.num_examples[phase]
-    def get_features(self, examples, is_training):
-        features = convert_examples_to_features(
-            examples=examples,
-            tokenizer=self._tokenizer,
-            max_seq_length=self._max_seq_length,
-            doc_stride=self._doc_stride,
-            max_query_length=self._max_query_length,
-            is_training=is_training)
-        return features
-    def data_generator(self,
-                       data_path,
-                       batch_size,
-                       phase='train',
-                       shuffle=False,
-                       version_2_with_negative=False,
-                       epoch=1):
-        if phase == 'train':
-            self.train_examples = self.get_examples(
-                data_path,
-                is_training=True,
-                version_2_with_negative=version_2_with_negative)
-            examples = self.train_examples
-            self.num_examples['train'] = len(self.train_examples)
-        elif phase == 'predict':
-            self.predict_examples = self.get_examples(
-                data_path,
-                is_training=False,
-                version_2_with_negative=version_2_with_negative)
-            examples = self.predict_examples
-            self.num_examples['predict'] = len(self.predict_examples)
-        else:
-            raise ValueError(
-                "Unknown phase, which should be in ['train', 'predict'].")
-        def batch_reader(features, batch_size, in_tokens):
-            batch, total_token_num, max_len = [], 0, 0
-            for (index, feature) in enumerate(features):
-                if phase == 'train':
-                    self.current_train_example = index + 1
-                seq_len = len(feature.input_ids)
-                labels = [feature.unique_id
-                          ] if feature.start_position is None else [
-                              feature.start_position, feature.end_position
-                          ]
-                example = [
-                    feature.input_ids, feature.segment_ids,
-                    range(seq_len)
-                ] + labels
-                max_len = max(max_len, seq_len)
-                #max_len = max(max_len, len(token_ids))
-                if in_tokens:
-                    to_append = (len(batch) + 1) * max_len <= batch_size
-                else:
-                    to_append = len(batch) < batch_size
-                if to_append:
-                    batch.append(example)
-                    total_token_num += seq_len
-                else:
-                    yield batch, total_token_num
-                    batch, total_token_num, max_len = [example
-                                                       ], seq_len, seq_len
-            if len(batch) > 0:
-                yield batch, total_token_num
-        def wrapper():
-            for epoch_index in range(epoch):
-                if shuffle:
-                    random.shuffle(examples)
-                if phase == 'train':
-                    self.current_train_epoch = epoch_index
-                    features = self.get_features(examples, is_training=True)
-                else:
-                    features = self.get_features(examples, is_training=False)
-                for batch_data, total_token_num in batch_reader(
-                        features, batch_size, self._in_tokens):
-                    yield prepare_batch_data(
-                        batch_data,
-                        total_token_num,
-                        voc_size=-1,
-                        pad_id=self.pad_id,
-                        cls_id=self.cls_id,
-                        sep_id=self.sep_id,
-                        mask_id=-1,
-                        return_input_mask=True,
-                        return_max_len=False,
-                        return_num_token=False)
-        return wrapper
-def write_predictions(all_examples, all_features, all_results, n_best_size,
-                      max_answer_length, do_lower_case, output_prediction_file,
-                      output_nbest_file, output_null_log_odds_file,
-                      version_2_with_negative, null_score_diff_threshold,
-                      verbose):
-    """Write final predictions to the json file and log-odds of null if needed."""
-    print("Writing predictions to: %s" % (output_prediction_file))
-    print("Writing nbest to: %s" % (output_nbest_file))
-    example_index_to_features = collections.defaultdict(list)
-    for feature in all_features:
-        example_index_to_features[feature.example_index].append(feature)
-    unique_id_to_result = {}
-    for result in all_results:
-        unique_id_to_result[result.unique_id] = result
-    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "PrelimPrediction", [
-            "feature_index", "start_index", "end_index", "start_logit",
-            "end_logit"
-        ])
-    all_predictions = collections.OrderedDict()
-    all_nbest_json = collections.OrderedDict()
-    scores_diff_json = collections.OrderedDict()
-    for (example_index, example) in enumerate(all_examples):
-        features = example_index_to_features[example_index]
-        prelim_predictions = []
-        # keep track of the minimum score of null start+end of position 0
-        score_null = 1000000  # large and positive
-        min_null_feature_index = 0  # the paragraph slice with min mull score
-        null_start_logit = 0  # the start logit at the slice with min null score
-        null_end_logit = 0  # the end logit at the slice with min null score
-        for (feature_index, feature) in enumerate(features):
-            result = unique_id_to_result[feature.unique_id]
-            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-            # if we could have irrelevant answers, get the min score of irrelevant
-            if version_2_with_negative:
-                feature_null_score = result.start_logits[0] + result.end_logits[
-                    0]
-                if feature_null_score < score_null:
-                    score_null = feature_null_score
-                    min_null_feature_index = feature_index
-                    null_start_logit = result.start_logits[0]
-                    null_end_logit = result.end_logits[0]
-            for start_index in start_indexes:
-                for end_index in end_indexes:
-                    # We could hypothetically create invalid predictions, e.g., predict
-                    # that the start of the span is in the question. We throw out all
-                    # invalid predictions.
-                    if start_index >= len(feature.tokens):
-                        continue
-                    if end_index >= len(feature.tokens):
-                        continue
-                    if start_index not in feature.token_to_orig_map:
-                        continue
-                    if end_index not in feature.token_to_orig_map:
-                        continue
-                    if not feature.token_is_max_context.get(start_index, False):
-                        continue
-                    if end_index < start_index:
-                        continue
-                    length = end_index - start_index + 1
-                    if length > max_answer_length:
-                        continue
-                    prelim_predictions.append(
-                        _PrelimPrediction(
-                            feature_index=feature_index,
-                            start_index=start_index,
-                            end_index=end_index,
-                            start_logit=result.start_logits[start_index],
-                            end_logit=result.end_logits[end_index]))
-        if version_2_with_negative:
-            prelim_predictions.append(
-                _PrelimPrediction(
-                    feature_index=min_null_feature_index,
-                    start_index=0,
-                    end_index=0,
-                    start_logit=null_start_logit,
-                    end_logit=null_end_logit))
-        prelim_predictions = sorted(
-            prelim_predictions,
-            key=lambda x: (x.start_logit + x.end_logit),
-            reverse=True)
-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
-        seen_predictions = {}
-        nbest = []
-        for pred in prelim_predictions:
-            if len(nbest) >= n_best_size:
-                break
-            feature = features[pred.feature_index]
-            if pred.start_index > 0:  # this is a non-null prediction
-                tok_tokens = feature.tokens[pred.start_index:(
-                    pred.end_index + 1)]
-                orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                orig_tokens = example.doc_tokens[orig_doc_start:(
-                    orig_doc_end + 1)]
-                tok_text = " ".join(tok_tokens)
-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
-                # Clean whitespace
-                tok_text = tok_text.strip()
-                tok_text = " ".join(tok_text.split())
-                orig_text = " ".join(orig_tokens)
-                final_text = get_final_text(tok_text, orig_text, do_lower_case,
-                                            verbose)
-                if final_text in seen_predictions:
-                    continue
-                seen_predictions[final_text] = True
-            else:
-                final_text = ""
-                seen_predictions[final_text] = True
-            nbest.append(
-                _NbestPrediction(
-                    text=final_text,
-                    start_logit=pred.start_logit,
-                    end_logit=pred.end_logit))
-        # if we didn't include the empty option in the n-best, include it
-        if version_2_with_negative:
-            if "" not in seen_predictions:
-                nbest.append(
-                    _NbestPrediction(
-                        text="",
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-        # In very rare edge cases we could have no valid predictions. So we
-        # just create a nonce prediction in this case to avoid failure.
-        if not nbest:
-            nbest.append(
-                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-        assert len(nbest) >= 1
-        total_scores = []
-        best_non_null_entry = None
-        for entry in nbest:
-            total_scores.append(entry.start_logit + entry.end_logit)
-            if not best_non_null_entry:
-                if entry.text:
-                    best_non_null_entry = entry
-        # debug
-        if best_non_null_entry is None:
-            print("Emmm..., sth wrong")
-        probs = _compute_softmax(total_scores)
-        nbest_json = []
-        for (i, entry) in enumerate(nbest):
-            output = collections.OrderedDict()
-            output["text"] = entry.text
-            output["probability"] = probs[i]
-            output["start_logit"] = entry.start_logit
-            output["end_logit"] = entry.end_logit
-            nbest_json.append(output)
-        assert len(nbest_json) >= 1
-        if not version_2_with_negative:
-            all_predictions[example.qas_id] = nbest_json[0]["text"]
-        else:
-            # predict "" iff the null score - the score of best non-null > threshold
-            score_diff = score_null - best_non_null_entry.start_logit - (
-                best_non_null_entry.end_logit)
-            scores_diff_json[example.qas_id] = score_diff
-            if score_diff > null_score_diff_threshold:
-                all_predictions[example.qas_id] = ""
-            else:
-                all_predictions[example.qas_id] = best_non_null_entry.text
-        all_nbest_json[example.qas_id] = nbest_json
-    with open(output_prediction_file, "w") as writer:
-        writer.write(json.dumps(all_predictions, indent=4) + "\n")
-    with open(output_nbest_file, "w") as writer:
-        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-    if version_2_with_negative:
-        with open(output_null_log_odds_file, "w") as writer:
-            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-def get_final_text(pred_text, orig_text, do_lower_case, verbose):
-    """Project the tokenized prediction back to the original text."""
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the SQuAD eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose:
-            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose:
-            print("Length not equal after stripping spaces: '%s' vs '%s'",
-                  orig_ns_text, tok_ns_text)
-        return orig_text
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
-        tok_s_to_ns_map[tok_index] = i
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-    if orig_start_position is None:
-        if verbose:
-            print("Couldn't map start position")
-        return orig_text
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-    if orig_end_position is None:
-        if verbose:
-            print("Couldn't map end position")
-        return orig_text
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-def _get_best_indexes(logits, n_best_size):
-    """Get the n-best logits from a list."""
-    index_and_score = sorted(
-        enumerate(logits), key=lambda x: x[1], reverse=True)
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
-if __name__ == '__main__':
-    train_file = 'squad/train-v1.1.json'
-    vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
-    do_lower_case = True
-    tokenizer = tokenization.FullTokenizer(
-        vocab_file=vocab_file, do_lower_case=do_lower_case)
-    train_examples = read_squad_examples(
-        input_file=train_file, is_training=True)
-    print("begin converting")
-    for (index, feature) in enumerate(
-            convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=384,
-                doc_stride=128,
-                max_query_length=64,
-                is_training=True,
-                #output_fn=train_writer.process_feature
-            )):
-        if index < 10:
-            print(index, feature.input_ids, feature.input_mask,
-                  feature.segment_ids)
-    #for (index, example) in enumerate(train_examples):
-    #    if index < 5:
-    #        print(example)
--- a/demo/bert-cls/run_fintune_with_hub.sh
+++ b/demo/bert-cls/run_fintune_with_hub.sh
@@ -8,12 +8,11 @@ HUB_MODULE_DIR="./hub_module/bert_chinese_L-12_H-768_A-12.hub_module"
 CKPT_DIR="./ckpt"
 #rm -rf $CKPT_DIR
 python -u finetune_with_hub.py \
-                   --batch_size 128 \
+                   --batch_size 32 \
                   --hub_module_dir=$HUB_MODULE_DIR \
                   --data_dir ${DATA_PATH} \
                   --weight_decay  0.01 \
                   --checkpoint_dir $CKPT_DIR \
-                   --warmup_proportion 0.0 \
+                   --num_epoch 3 \
-                   --epoch 2 \
+                   --max_seq_len 128 \
-                   --max_seq_len 16 \
                   --learning_rate 5e-5
--- a/demo/bert-cls/utils/__init__.py
+++ b/demo/bert-cls/utils/__init__.py
--- a/demo/bert-cls/utils/args.py
+++ b/demo/bert-cls/utils/args.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Arguments for configuration."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import six
-import argparse
-def str2bool(v):
-    # because argparse cannot directly parse strings such as "true" or "False"
-    # into Python booleans
-    return v.lower() in ("true", "t", "1")
-class ArgumentGroup(object):
-    def __init__(self, parser, title, des):
-        self._group = parser.add_argument_group(title=title, description=des)
-    def add_arg(self, name, type, default, help, **kwargs):
-        type = str2bool if type == bool else type
-        self._group.add_argument(
-            "--" + name,
-            default=default,
-            type=type,
-            help=help + ' Default: %(default)s.',
-            **kwargs)
-def print_arguments(args):
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(six.iteritems(vars(args))):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
--- a/demo/bert-cls/utils/fp16.py
+++ b/demo/bert-cls/utils/fp16.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import paddle
-import paddle.fluid as fluid
-def cast_fp16_to_fp32(i, o, prog):
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={
-            "in_dtype": fluid.core.VarDesc.VarType.FP16,
-            "out_dtype": fluid.core.VarDesc.VarType.FP32
-        })
-def cast_fp32_to_fp16(i, o, prog):
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={
-            "in_dtype": fluid.core.VarDesc.VarType.FP32,
-            "out_dtype": fluid.core.VarDesc.VarType.FP16
-        })
-def copy_to_master_param(p, block):
-    v = block.vars.get(p.name, None)
-    if v is None:
-        raise ValueError("no param name %s found!" % p.name)
-    new_p = fluid.framework.Parameter(
-        block=block,
-        shape=v.shape,
-        dtype=fluid.core.VarDesc.VarType.FP32,
-        type=v.type,
-        lod_level=v.lod_level,
-        stop_gradient=p.stop_gradient,
-        trainable=p.trainable,
-        optimize_attr=p.optimize_attr,
-        regularizer=p.regularizer,
-        gradient_clip_attr=p.gradient_clip_attr,
-        error_clip=p.error_clip,
-        name=v.name + ".master")
-    return new_p
-def create_master_params_grads(params_grads, main_prog, startup_prog,
-                               loss_scaling):
-    master_params_grads = []
-    tmp_role = main_prog._current_role
-    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
-    main_prog._current_role = OpRole.Backward
-    for p, g in params_grads:
-        # create master parameters
-        master_param = copy_to_master_param(p, main_prog.global_block())
-        startup_master_param = startup_prog.global_block()._clone_variable(
-            master_param)
-        startup_p = startup_prog.global_block().var(p.name)
-        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
-        # cast fp16 gradients to fp32 before apply gradients
-        if g.name.find("layer_norm") > -1:
-            if loss_scaling > 1:
-                scaled_g = g / float(loss_scaling)
-            else:
-                scaled_g = g
-            master_params_grads.append([p, scaled_g])
-            continue
-        master_grad = fluid.layers.cast(g, "float32")
-        if loss_scaling > 1:
-            master_grad = master_grad / float(loss_scaling)
-        master_params_grads.append([master_param, master_grad])
-    main_prog._current_role = tmp_role
-    return master_params_grads
-def master_param_to_train_param(master_params_grads, params_grads, main_prog):
-    for idx, m_p_g in enumerate(master_params_grads):
-        train_p, _ = params_grads[idx]
-        if train_p.name.find("layer_norm") > -1:
-            continue
-        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
-            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
--- a/demo/bert-cls/utils/init.py
+++ b/demo/bert-cls/utils/init.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import os
-import six
-import ast
-import copy
-import numpy as np
-import paddle.fluid as fluid
-def cast_fp32_to_fp16(exe, main_program):
-    print("Cast parameters to float16 data format.")
-    for param in main_program.global_block().all_parameters():
-        if not param.name.endswith(".master"):
-            param_t = fluid.global_scope().find_var(param.name).get_tensor()
-            data = np.array(param_t)
-            if param.name.find("layer_norm") == -1:
-                param_t.set(np.float16(data).view(np.uint16), exe.place)
-            master_param_var = fluid.global_scope().find_var(param.name +
-                                                             ".master")
-            if master_param_var is not None:
-                master_param_var.get_tensor().set(data, exe.place)
-def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
-    assert os.path.exists(
-        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
-    def existed_persitables(var):
-        if not fluid.io.is_persistable(var):
-            return False
-        return os.path.exists(os.path.join(init_checkpoint_path, var.name))
-    fluid.io.load_vars(
-        exe,
-        init_checkpoint_path,
-        main_program=main_program,
-        predicate=existed_persitables)
-    print("Load model from {}".format(init_checkpoint_path))
-    if use_fp16:
-        cast_fp32_to_fp16(exe, main_program)
-def init_pretraining_params(exe,
-                            pretraining_params_path,
-                            main_program,
-                            use_fp16=False):
-    assert os.path.exists(pretraining_params_path
-                          ), "[%s] cann't be found." % pretraining_params_path
-    def existed_params(var):
-        if not isinstance(var, fluid.framework.Parameter):
-            print("param {} not exsist!".format(var.name))
-            return False
-        return os.path.exists(os.path.join(pretraining_params_path, var.name))
-    fluid.io.load_vars(
-        exe,
-        pretraining_params_path,
-        main_program=main_program,
-        predicate=existed_params)
-    print(
-        "Load pretraining parameters from {}.".format(pretraining_params_path))
-    if use_fp16:
-        cast_fp32_to_fp16(exe, main_program)
--- a/paddle_hub/__init__.py
+++ b/paddle_hub/__init__.py
@@ -14,6 +14,7 @@
 from . import module
 from . import common
 from . import io
+from . import dataset
 from .common.dir import USER_HOME
 from .common.dir import HUB_HOME
@@ -34,3 +35,5 @@ from .finetune.network import append_mlp_classifier
 from .finetune.finetune import finetune_and_eval
 from .finetune.config import FinetuneConfig
 from .finetune.task import Task
+from .reader import BERTTokenizeReader
--- a/paddle_hub/common/dir.py
+++ b/paddle_hub/common/dir.py
@@ -14,6 +14,7 @@
 import os
+# TODO: Rename dir.py; the current filename does not comply with the naming rules
 USER_HOME = os.path.expanduser('~')
 HUB_HOME = os.path.join(USER_HOME, ".hub")
 MODULE_HOME = os.path.join(HUB_HOME, "modules")

--- a/paddle_hub/common/downloader.py
+++ b/paddle_hub/common/downloader.py
@@ -88,7 +88,7 @@ class Downloader:
                            done = int(50 * dl / total_length)
                            if time.time() - starttime >= FLUSH_INTERVAL:
                                sys.stdout.write(
-                                    "\r%s : [%-50s]%.2f%%" %
+                                    "\r%s : [%-50s] %.2f%%" %
                                    (save_name, '=' * done,
                                     float(dl / total_length * 100)))
                                starttime = time.time()

--- a/paddle_hub/common/logger.py
+++ b/paddle_hub/common/logger.py
@@ -19,8 +19,7 @@ import logging
 import math
-class Logger:
+class Logger(object):
    PLACEHOLDER = '%'
    NOLOG = "NOLOG"
@@ -29,7 +28,7 @@ class Logger:
            format='[%(asctime)-15s] [%(levelname)8s] - %(message)s')
        if not name:
-            name = "paddle-hub"
+            name = "PaddleHub"
        self.logger = logging.getLogger(name)
        self.logLevel = "DEBUG"

--- a/paddle_hub/dataset/__init__.py
+++ b/paddle_hub/dataset/__init__.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .dataset import InputExample, HubDataset
+from .chnsenticorp import ChnSentiCorp
+from .msra_ner import MSRA_NER
--- a/paddle_hub/dataset/chnsenticorp.py
+++ b/paddle_hub/dataset/chnsenticorp.py
@@ -12,31 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle_hub.tools.downloader import default_downloader
+from paddle_hub.common.downloader import default_downloader
-from paddle_hub.dir import DATA_HOME
+from paddle_hub.common.dir import DATA_HOME
 import os
 import csv
+from paddle_hub.dataset import InputExample
+from paddle_hub.dataset import HubDataset
 from collections import namedtuple
 DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/chnsenticorp_data.tar.gz"
-class HubDataset(object):
-    def get_train_examples(self):
-        raise NotImplementedError()
-    def get_dev_examples(self):
-        raise NotImplementedError()
-    def get_test_examples(self):
-        raise NotImplementedError()
-    def get_val_examples(self):
-        return self.get_dev_examples()
 class ChnSentiCorp(HubDataset):
+    """
+    ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
+    opinion mining)
+    """
    def __init__(self):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=DATA_URL, save_path=DATA_HOME, print_progress=True)
@@ -66,15 +60,20 @@ class ChnSentiCorp(HubDataset):
    def get_test_examples(self):
        return self.test_examples
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            Example = namedtuple('Example', ["label", "text_a"])
            examples = []
+            seq_id = 0
            for line in reader:
-                example = Example(*line)
+                example = InputExample(
+                    guid=seq_id, label=line[0], text_a=line[1])
+                seq_id += 1
                examples.append(example)
            return examples
@@ -82,5 +81,5 @@ class ChnSentiCorp(HubDataset):
 if __name__ == "__main__":
    ds = ChnSentiCorp()
-    for e in ds.get_train_example():
+    for e in ds.get_train_examples():
        print(e)
--- a/paddle_hub/dataset/dataset.py
+++ b/paddle_hub/dataset/dataset.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+class InputExample(object):
+    """
+    Input data structure for BERT/ERNIE. It supports single-sequence tasks such
+    as text classification and sequence labeling, as well as sequence-pair
+    tasks such as dialog tasks.
+    """
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs an InputExample.
+    Args:
+      guid: Unique id for the example.
+      text_a: string. The untokenized text of the first sequence. For single
+        sequence tasks, only this sequence must be specified.
+      text_b: (Optional) string. The untokenized text of the second sequence.
+        Must be specified only for sequence pair tasks.
+      label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+class HubDataset(object):
+    def get_train_examples(self):
+        raise NotImplementedError()
+    def get_dev_examples(self):
+        raise NotImplementedError()
+    def get_test_examples(self):
+        raise NotImplementedError()
+    def get_val_examples(self):
+        return self.get_dev_examples()
+    def get_labels(self):
+        raise NotImplementedError()
--- a/paddle_hub/dataset/msra_ner.py
+++ b/paddle_hub/dataset/msra_ner.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle_hub.tools.downloader import default_downloader
+from paddle_hub.common.downloader import default_downloader
-from paddle_hub.dir import DATA_HOME
+from paddle_hub.common.dir import DATA_HOME
 import os
 import csv
@@ -28,7 +28,6 @@ class MSRA_NER(object):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=DATA_URL, save_path=DATA_HOME, print_progress=True)
-        print(self.dataset_dir)
        self._load_label_map()
        self._load_train_examples()
@@ -44,6 +43,10 @@ class MSRA_NER(object):
    def get_train_examples(self):
        return self.train_examples
+    def get_labels(self):
+        """Returns the list of label strings for this dataset."""
+        return ["0", "1"]
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:

--- a/paddle_hub/dataset/task_reader.py
+++ b/paddle_hub/dataset/task_reader.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import csv
-import json
-import numpy as np
-from collections import namedtuple
-import tokenization
-from batching import pad_batch_data
-class BaseReader(object):
-    def __init__(self,
-                 vocab_path,
-                 label_map_config=None,
-                 max_seq_len=512,
-                 do_lower_case=True,
-                 in_tokens=False,
-                 random_seed=None):
-        self.max_seq_len = max_seq_len
-        self.tokenizer = tokenization.FullTokenizer(
-            vocab_file=vocab_path, do_lower_case=do_lower_case)
-        self.vocab = self.tokenizer.vocab
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.in_tokens = in_tokens
-        np.random.seed(random_seed)
-        self.current_example = 0
-        self.current_epoch = 0
-        self.num_examples = 0
-        if label_map_config:
-            with open(label_map_config) as f:
-                self.label_map = json.load(f)
-        else:
-            self.label_map = None
-        pass
-    def get_train_progress(self):
-        """Gets progress for training phase."""
-        return self.current_example, self.current_epoch
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            headers = next(reader)
-            Example = namedtuple('Example', headers)
-            examples = []
-            for line in reader:
-                example = Example(*line)
-                examples.append(example)
-            return examples
-    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
-        """Truncates a sequence pair in place to the maximum length."""
-        # This is a simple heuristic which will always truncate the longer sequence
-        # one token at a time. This makes more sense than truncating an equal percent
-        # of tokens from each, since if one sequence is very short then each token
-        # that's truncated likely contains more information than a longer sequence.
-        while True:
-            total_length = len(tokens_a) + len(tokens_b)
-            if total_length <= max_length:
-                break
-            if len(tokens_a) > len(tokens_b):
-                tokens_a.pop()
-            else:
-                tokens_b.pop()
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        """Converts a single `Example` into a single `Record`."""
-        text_a = tokenization.convert_to_unicode(example.text_a)
-        tokens_a = tokenizer.tokenize(text_a)
-        tokens_b = None
-        if "text_b" in example._fields:
-            text_b = tokenization.convert_to_unicode(example.text_b)
-            tokens_b = tokenizer.tokenize(text_b)
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
-        # The convention in BERT/ERNIE is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0     0   0   0  0     0 0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        text_type_ids = []
-        tokens.append("[CLS]")
-        text_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            text_type_ids.append(0)
-        tokens.append("[SEP]")
-        text_type_ids.append(0)
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                text_type_ids.append(1)
-            tokens.append("[SEP]")
-            text_type_ids.append(1)
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-        if self.label_map:
-            label_id = self.label_map[example.label]
-        else:
-            label_id = example.label
-        Record = namedtuple(
-            'Record',
-            ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
-        qid = None
-        if "qid" in example._fields:
-            qid = example.qid
-        record = Record(
-            token_ids=token_ids,
-            text_type_ids=text_type_ids,
-            position_ids=position_ids,
-            label_id=label_id,
-            qid=qid)
-        return record
-    def _prepare_batch_data(self, examples, batch_size, phase=None):
-        """generate batch records"""
-        batch_records, max_len = [], 0
-        for index, example in enumerate(examples):
-            if phase == "train":
-                self.current_example = index
-            record = self._convert_example_to_record(example, self.max_seq_len,
-                                                     self.tokenizer)
-            max_len = max(max_len, len(record.token_ids))
-            if self.in_tokens:
-                to_append = (len(batch_records) + 1) * max_len <= batch_size
-            else:
-                to_append = len(batch_records) < batch_size
-            if to_append:
-                batch_records.append(record)
-            else:
-                yield self._pad_batch_records(batch_records)
-                batch_records, max_len = [record], len(record.token_ids)
-        if len(batch_records) > 0:
-            yield self._pad_batch_records(batch_records)
-    def get_num_examples(self, input_file):
-        examples = self._read_tsv(input_file)
-        return len(examples)
-    def data_generator(self,
-                       input_file,
-                       batch_size,
-                       epoch,
-                       shuffle=True,
-                       phase=None):
-        examples = self._read_tsv(input_file)
-        def wrapper():
-            for epoch_index in range(epoch):
-                if phase == "train":
-                    self.current_example = 0
-                    self.current_epoch = epoch_index
-                if shuffle:
-                    np.random.shuffle(examples)
-                for batch_data in self._prepare_batch_data(
-                        examples, batch_size, phase=phase):
-                    yield batch_data
-        return wrapper
-class ClassifyReader(BaseReader):
-    def _read_tsv(self, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            headers = next(reader)
-            text_indices = [
-                index for index, h in enumerate(headers) if h != "label"
-            ]
-            Example = namedtuple('Example', headers)
-            examples = []
-            for line in reader:
-                for index, text in enumerate(line):
-                    if index in text_indices:
-                        line[index] = text.replace(' ', '')
-                example = Example(*line)
-                examples.append(example)
-            return examples
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-        batch_labels = [record.label_id for record in batch_records]
-        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])
-        if batch_records[0].qid:
-            batch_qids = [record.qid for record in batch_records]
-            batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
-        else:
-            batch_qids = np.array([]).astype("int64").reshape([-1, 1])
-        # padding
-        padded_token_ids, next_sent_index, self_attn_bias = pad_batch_data(
-            batch_token_ids,
-            pad_idx=self.pad_id,
-            return_next_sent_pos=True,
-            return_attn_bias=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            self_attn_bias, batch_labels, next_sent_index, batch_qids
-        ]
-        return return_list
-class SequenceLabelReader(BaseReader):
-    def _pad_batch_records(self, batch_records):
-        batch_token_ids = [record.token_ids for record in batch_records]
-        batch_text_type_ids = [record.text_type_ids for record in batch_records]
-        batch_position_ids = [record.position_ids for record in batch_records]
-        batch_label_ids = [record.label_ids for record in batch_records]
-        batch_seq_lens = [len(record.token_ids) for record in batch_records]
-        # padding
-        padded_token_ids, self_attn_bias = pad_batch_data(
-            batch_token_ids,
-            pad_idx=self.pad_id,
-            return_next_sent_pos=False,
-            return_attn_bias=True)
-        padded_text_type_ids = pad_batch_data(
-            batch_text_type_ids, pad_idx=self.pad_id)
-        padded_position_ids = pad_batch_data(
-            batch_position_ids, pad_idx=self.pad_id)
-        padded_label_ids = pad_batch_data(
-            batch_label_ids, pad_idx=len(self.label_map) - 1)
-        batch_seq_lens = np.array(batch_seq_lens).astype("int64").reshape(
-            [-1, 1])
-        return_list = [
-            padded_token_ids, padded_text_type_ids, padded_position_ids,
-            self_attn_bias, padded_label_ids, batch_seq_lens
-        ]
-        return return_list
-    def _reseg_token_label(self, tokens, labels, tokenizer):
-        assert len(tokens) == len(labels)
-        ret_tokens = []
-        ret_labels = []
-        for token, label in zip(tokens, labels):
-            sub_token = tokenizer.tokenize(token)
-            if len(sub_token) == 0:
-                continue
-            ret_tokens.extend(sub_token)
-            ret_labels.append(label)
-            if len(sub_token) < 2:
-                continue
-            sub_label = label
-            if label.startswith("B-"):
-                sub_label = "I-" + label[2:]
-            ret_labels.extend([sub_label] * (len(sub_token) - 1))
-        assert len(ret_tokens) == len(ret_labels)
-        return ret_tokens, ret_labels
-    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
-        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
-        labels = tokenization.convert_to_unicode(example.label).split(u"")
-        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
-        if len(tokens) > max_seq_length - 2:
-            tokens = tokens[0:(max_seq_length - 2)]
-            labels = labels[0:(max_seq_length - 2)]
-        tokens = ["[CLS]"] + tokens + ["[SEP]"]
-        token_ids = tokenizer.convert_tokens_to_ids(tokens)
-        position_ids = list(range(len(token_ids)))
-        text_type_ids = [0] * len(token_ids)
-        no_entity_id = len(self.label_map) - 1
-        label_ids = [no_entity_id
-                     ] + [self.label_map[label]
-                          for label in labels] + [no_entity_id]
-        Record = namedtuple(
-            'Record',
-            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
-        record = Record(
-            token_ids=token_ids,
-            text_type_ids=text_type_ids,
-            position_ids=position_ids,
-            label_ids=label_ids)
-        return record
-if __name__ == '__main__':
-    pass
--- a/paddle_hub/dataset/tokenization.py
+++ b/paddle_hub/dataset/tokenization.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import collections
-import unicodedata
-import six
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text.decode("utf-8", "ignore")
-        elif isinstance(text, unicode):
-            return text
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python2 or Python 3?")
-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `tf.logging`."""
-    # These functions want `str` for both Python2 and Python3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
-    if six.PY3:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, bytes):
-            return text.decode("utf-8", "ignore")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    elif six.PY2:
-        if isinstance(text, str):
-            return text
-        elif isinstance(text, unicode):
-            return text.encode("utf-8")
-        else:
-            raise ValueError("Unsupported string type: %s" % (type(text)))
-    else:
-        raise ValueError("Not running on Python2 or Python 3?")
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    fin = open(vocab_file)
-    for num, line in enumerate(fin):
-        items = convert_to_unicode(line.strip()).split("\t")
-        if len(items) > 2:
-            break
-        token = items[0]
-        index = items[1] if len(items) == 2 else num
-        token = token.strip()
-        vocab[token] = int(index)
-    return vocab
-def convert_by_vocab(vocab, items):
-    """Converts a sequence of [tokens|ids] using the vocab."""
-    output = []
-    for item in items:
-        output.append(vocab[item])
-    return output
-def convert_tokens_to_ids(vocab, tokens):
-    return convert_by_vocab(vocab, tokens)
-def convert_ids_to_tokens(inv_vocab, ids):
-    return convert_by_vocab(inv_vocab, ids)
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a peice of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-class FullTokenizer(object):
-    """Runs end-to-end tokenziation."""
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-    def tokenize(self, text):
-        split_tokens = []
-        for token in self.basic_tokenizer.tokenize(text):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-        return split_tokens
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-class CharTokenizer(object):
-    """Runs end-to-end tokenziation."""
-    def __init__(self, vocab_file, do_lower_case=True):
-        self.vocab = load_vocab(vocab_file)
-        self.inv_vocab = {v: k for k, v in self.vocab.items()}
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-    def tokenize(self, text):
-        split_tokens = []
-        for token in text.lower().split(" "):
-            for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                split_tokens.append(sub_token)
-        return split_tokens
-    def convert_tokens_to_ids(self, tokens):
-        return convert_by_vocab(self.vocab, tokens)
-    def convert_ids_to_tokens(self, ids):
-        return convert_by_vocab(self.inv_vocab, ids)
-class BasicTokenizer(object):
-    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-    def __init__(self, do_lower_case=True):
-        """Constructs a BasicTokenizer.
-        Args:
-            do_lower_case: Whether to lower case the input.
-        """
-        self.do_lower_case = do_lower_case
-    def tokenize(self, text):
-        """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
-        text = self._clean_text(text)
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        text = self._tokenize_chinese_chars(text)
-        orig_tokens = whitespace_tokenize(text)
-        split_tokens = []
-        for token in orig_tokens:
-            if self.do_lower_case:
-                token = token.lower()
-                token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token))
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-    def _run_split_on_punc(self, text):
-        """Splits punctuation on a piece of text."""
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-        return ["".join(x) for x in output]
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-            (cp >= 0x3400 and cp <= 0x4DBF) or  #
-            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-            (cp >= 0x2B820 and cp <= 0x2CEAF) or
-            (cp >= 0xF900 and cp <= 0xFAFF) or  #
-            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-            return True
-        return False
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xfffd or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-class WordpieceTokenizer(object):
-    """Runs WordPiece tokenziation."""
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-    def tokenize(self, text):
-        """Tokenizes a piece of text into its word pieces.
-        This uses a greedy longest-match-first algorithm to perform tokenization
-        using the given vocabulary.
-        For example:
-            input = "unaffable"
-            output = ["un", "##aff", "##able"]
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through `BasicTokenizer.
-        Returns:
-            A list of wordpiece tokens.
-        """
-        text = convert_to_unicode(text)
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically contorl characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
-            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
--- a/paddle_hub/finetune/checkpoint.py
+++ b/paddle_hub/finetune/checkpoint.py
@@ -36,14 +36,17 @@ def load_checkpoint(checkpoint_dir, exe):
        fluid.io.load_persistables(exe, ckpt.latest_model_dir)
-        logger.info("Checkpoint loaded. current_epoch={},"
+        logger.info("PaddleHub model checkpoint loaded. current_epoch={}, "
-                    "global_step={}".format(current_epoch, global_step))
+                    "global_step={}".format(ckpt.current_epoch,
+                                            ckpt.global_step))
        return ckpt.current_epoch, ckpt.global_step
    else:
        current_epoch = 1
        global_step = 0
        latest_model_dir = None
-        logger.info("Checkpoint not found, start training from scratch...")
+        logger.info(
+            "PaddleHub model checkpoint not found, start training from scratch..."
+        )
        exe.run(fluid.default_startup_program())
        return current_epoch, global_step

--- a/paddle_hub/finetune/finetune.py
+++ b/paddle_hub/finetune/finetune.py
@@ -40,7 +40,6 @@ def _get_running_device_info(config):
 def _do_memory_optimization(task, config):
    if config.enable_memory_optim:
        logger.info("Memory optimization start...")
        task_var_name = task.metric_variable_names()
@@ -56,7 +55,7 @@ def _do_memory_optimization(task, config):
    lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        program=fluid.default_main_program(), batch_size=config.batch_size)
-    logger.info("Theoretical memory usage in training: %.3f - %.3f %s" %
+    logger.info("Theoretical memory usage in training: %.2f - %.2f %s" %
                (lower_mem, upper_mem, unit)),
@@ -102,6 +101,7 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
            eval_loss_scalar = logw.scalar(tag="loss[evaluate]")
            eval_acc_scalar = logw.scalar(tag="accuracy[evaluate]")
+        # Finetune loop
        for epoch in range(current_epoch, num_epoch + 1):
            train_reader = data_reader.data_generator(
                batch_size=batch_size, phase='train')
@@ -134,9 +134,6 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
                    num_trained_examples = acc_sum = loss_sum = 0
                if global_step % config.save_ckpt_interval == 0:
-                    model_saved_dir = os.path.join(config.checkpoint_dir,
-                                                   "step_%d" % global_step)
-                    fluid.io.save_persistables(exe, dirname=model_saved_dir)
                    # NOTE: current saved checkpoint machanism is not completed,
                    # it can't restore dataset training status
                    save_checkpoint(
@@ -163,9 +160,6 @@ def _finetune_model(task, data_reader, feed_list, config=None, do_eval=False):
                            (model_saved_dir, best_eval_acc))
                        fluid.io.save_persistables(exe, dirname=model_saved_dir)
-        # update model and checkpoint
-        model_saved_dir = os.path.join(config.checkpoint_dir, "final_model")
-        fluid.io.save_persistables(exe, dirname=model_saved_dir)
        # NOTE: current saved checkpoint machanism is not completed, it can't
        # resotre dataset training status
        save_checkpoint(
@@ -188,6 +182,7 @@ def finetune(task, data_reader, feed_list, config=None):
 def evaluate(task, data_reader, feed_list, phase="test", config=None):
+    logger.info("Evaluation on {} dataset start".format(phase))
    inference_program = task.inference_program()
    main_program = task.main_program()
    loss = task.variable("loss")
@@ -216,7 +211,8 @@ def evaluate(task, data_reader, feed_list, phase="test", config=None):
        avg_loss = loss_sum / num_eval_examples
        avg_acc = acc_sum / num_eval_examples
        eval_speed = eval_step / eval_time_used
-    logger.info("[evaluation on %s set] loss=%.5f acc=%.5f [step/sec: %.2f]" %
+    logger.info(
-                (phase, avg_loss, avg_acc, eval_speed))
+        "[%s dataset evaluation result] loss=%.5f acc=%.5f [step/sec: %.2f]" %
+        (phase, avg_loss, avg_acc, eval_speed))
    return avg_loss, avg_acc, eval_speed
--- a/paddle_hub/module/module.py
+++ b/paddle_hub/module/module.py
@@ -104,6 +104,7 @@ class Module(object):
        self.module_info = None
        self.processor = None
        self.name = "temp"
+        # TODO(wuzewu): print more module loading info log
        if url:
            self._init_with_url(url=url)
        elif module_dir:

--- a/paddle_hub/reader/__init__.py
+++ b/paddle_hub/reader/__init__.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .nlp_reader import BERTTokenizeReader
--- a/demo/bert-cls/reader/batching.py
+++ b/demo/bert-cls/reader/batching.py
@@ -188,7 +188,3 @@ def pad_batch_data(insts,
        return_list += [num_token]
    return return_list if len(return_list) > 1 else return_list[0]
-if __name__ == "__main__":
-    pass
--- a/demo/bert-cls/reader/cls.py
+++ b/demo/bert-cls/reader/cls.py
@@ -16,52 +16,50 @@ import os
 import types
 import csv
 import numpy as np
-import tokenization
+#from paddle_hub import dataset
+from paddle_hub.reader import tokenization
 from .batching import prepare_batch_data
-class DataProcessor(object):
+class BERTTokenizeReader(object):
    """Base class for data converters for sequence classification data sets."""
    def __init__(self,
-                 data_dir,
+                 dataset,
                 vocab_path,
                 max_seq_len,
                 do_lower_case=True,
-                 in_tokens=False,
                 random_seed=None):
-        self.data_dir = data_dir
+        self.dataset = dataset
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
-        self.in_tokens = in_tokens
        np.random.seed(random_seed)
-        self.current_train_example = -1
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
-        self.current_train_epoch = -1
-    def get_train_examples(self, data_dir):
+    def get_train_examples(self):
        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
+        return self.dataset.get_train_examples()
-    def get_dev_examples(self, data_dir):
+    def get_dev_examples(self):
        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
+        return self.dataset.get_dev_examples()
-    def get_val_examples(self, data_dir):
+    def get_val_examples(self):
        """Gets a collection of `InputExample`s for the val set."""
-        raise NotImplementedError()
+        return self.dataset.get_val_examples()
-    def get_test_examples(self, data_dir):
+    def get_test_examples(self):
        """Gets a collection of `InputExample`s for prediction."""
-        raise NotImplementedError()
+        return self.dataset.get_test_examples()
    def get_labels(self):
        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
+        return self.dataset.get_labels()
    def convert_example(self, index, example, labels, max_seq_len, tokenizer):
        """Converts a single `InputExample` into a single `InputFeatures`."""
@@ -76,9 +74,10 @@ class DataProcessor(object):
        Args:
            feature: InputFeatures(object). A single set of features of data.
        """
-        input_pos = list(range(len(feature.input_ids)))
+        position_ids = list(range(len(feature.input_ids)))
        return [
-            feature.input_ids, feature.segment_ids, input_pos, feature.label_id
+            feature.input_ids, feature.segment_ids, position_ids,
+            feature.label_id
        ]
    def generate_batch_data(self,
@@ -87,7 +86,7 @@ class DataProcessor(object):
                            voc_size=-1,
                            mask_id=-1,
                            return_input_mask=True,
-                            return_max_len=True,
+                            return_max_len=False,
                            return_num_token=False):
        return prepare_batch_data(
            batch_data,
@@ -99,19 +98,9 @@ class DataProcessor(object):
            sep_id=self.vocab["[SEP]"],
            mask_id=-1,
            return_input_mask=return_input_mask,
-            return_max_len=True,
+            return_max_len=return_max_len,
            return_num_token=return_num_token)
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                lines.append(line)
-            return lines
    def get_num_examples(self, phase):
        """Get number of examples for train, dev or test."""
        if phase not in ['train', 'val', 'dev', 'test']:
@@ -120,13 +109,9 @@ class DataProcessor(object):
            )
        return self.num_examples[phase]
-    def get_train_progress(self):
+    def data_generator(self, batch_size, phase='train', shuffle=True):
-        """Gets progress for training phase."""
-        return self.current_train_example, self.current_train_epoch
-    def data_generator(self, batch_size, phase='train', epoch=1, shuffle=True):
        """
-        Generate data for train, dev or test.
+        Generate data for train, dev/val or test.
        Args:
          batch_size: int. The batch size of generated data.
@@ -135,59 +120,49 @@ class DataProcessor(object):
          shuffle: bool. Whether to shuffle examples.
        """
        if phase == 'train':
-            examples = self.get_train_examples(self.data_dir)
+            examples = self.get_train_examples()
            self.num_examples['train'] = len(examples)
        elif phase == 'val' or phase == 'dev':
-            examples = self.get_dev_examples(self.data_dir)
+            examples = self.get_dev_examples()
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
-            examples = self.get_test_examples(self.data_dir)
+            examples = self.get_test_examples()
            self.num_examples['test'] = len(examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        def instance_reader():
-            for epoch_index in range(epoch):
+            """
-                if shuffle:
+            convert a single instance to BERT input feature
-                    np.random.shuffle(examples)
+            """
-                if phase == 'train':
+            if shuffle:
-                    self.current_train_epoch = epoch_index
+                np.random.shuffle(examples)
-                for (index, example) in enumerate(examples):
+            for (index, example) in enumerate(examples):
-                    if phase == 'train':
+                feature = self.convert_example(index, example,
-                        self.current_train_example = index + 1
+                                               self.get_labels(),
-                    feature = self.convert_example(index, example,
+                                               self.max_seq_len, self.tokenizer)
-                                                   self.get_labels(),
-                                                   self.max_seq_len,
+                instance = self.generate_instance(feature)
-                                                   self.tokenizer)
+                yield instance
-                    instance = self.generate_instance(feature)
+        def batch_reader(reader, batch_size):
-                    yield instance
-        def batch_reader(reader, batch_size, in_tokens):
            batch, total_token_num, max_len = [], 0, 0
            for instance in reader():
                token_ids, sent_ids, pos_ids, label = instance[:4]
                max_len = max(max_len, len(token_ids))
-                if in_tokens:
+                batch.append(instance)
-                    to_append = (len(batch) + 1) * max_len <= batch_size
+                total_token_num += len(token_ids)
-                else:
+                if len(batch) == batch_size:
-                    to_append = len(batch) < batch_size
-                if to_append:
-                    batch.append(instance)
-                    total_token_num += len(token_ids)
-                else:
                    yield batch, total_token_num
-                    batch, total_token_num, max_len = [
+                    batch, total_token_num, max_len = [], 0, 0
-                        instance
-                    ], len(token_ids), len(token_ids)
            if len(batch) > 0:
                yield batch, total_token_num
        def wrapper():
            for batch_data, total_token_num in batch_reader(
-                    instance_reader, batch_size, self.in_tokens):
+                    instance_reader, batch_size):
                batch_data = self.generate_batch_data(
                    batch_data,
                    total_token_num,
@@ -201,27 +176,6 @@ class DataProcessor(object):
        return wrapper
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-    Args:
-      guid: Unique id for the example.
-      text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-      text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
-      label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
@@ -249,271 +203,6 @@ class InputFeatures(object):
        self.label_id = label_id
-class XnliProcessor(DataProcessor):
-    """Processor for the XNLI data set."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        self.language = "zh"
-        lines = self._read_tsv(
-            os.path.join(data_dir, "multinli",
-                         "multinli.train.%s.tsv" % self.language))
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "train-%d" % (i)
-            text_a = tokenization.convert_to_unicode(line[0])
-            text_b = tokenization.convert_to_unicode(line[1])
-            label = tokenization.convert_to_unicode(line[2])
-            if label == tokenization.convert_to_unicode("contradictory"):
-                label = tokenization.convert_to_unicode("contradiction")
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        self.language = "zh"
-        lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "dev-%d" % (i)
-            language = tokenization.convert_to_unicode(line[0])
-            if language != tokenization.convert_to_unicode(self.language):
-                continue
-            text_a = tokenization.convert_to_unicode(line[6])
-            text_b = tokenization.convert_to_unicode(line[7])
-            label = tokenization.convert_to_unicode(line[1])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        self.language = "zh"
-        lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "test-%d" % (i)
-            language = tokenization.convert_to_unicode(line[0])
-            if language != tokenization.convert_to_unicode(self.language):
-                continue
-            text_a = tokenization.convert_to_unicode(line[6])
-            text_b = tokenization.convert_to_unicode(line[7])
-            label = tokenization.convert_to_unicode(line[1])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-class MnliProcessor(DataProcessor):
-    """Processor for the MultiNLI data set (GLUE version)."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
-            "dev_matched")
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(
-                line[0]))
-            text_a = tokenization.convert_to_unicode(line[8])
-            text_b = tokenization.convert_to_unicode(line[9])
-            if set_type == "test":
-                label = "contradiction"
-            else:
-                label = tokenization.convert_to_unicode(line[-1])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-class MrpcProcessor(DataProcessor):
-    """Processor for the MRPC data set (GLUE version)."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            text_a = tokenization.convert_to_unicode(line[3])
-            text_b = tokenization.convert_to_unicode(line[4])
-            if set_type == "test":
-                label = "0"
-            else:
-                label = tokenization.convert_to_unicode(line[0])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-class ColaProcessor(DataProcessor):
-    """Processor for the CoLA data set (GLUE version)."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            # Only the test set has a header
-            if set_type == "test" and i == 0:
-                continue
-            guid = "%s-%s" % (set_type, i)
-            if set_type == "test":
-                text_a = tokenization.convert_to_unicode(line[1])
-                label = "0"
-            else:
-                text_a = tokenization.convert_to_unicode(line[3])
-                label = tokenization.convert_to_unicode(line[1])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-class ChnsenticorpProcessor(DataProcessor):
-    """Processor for the Chnsenticorp data set."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type, i)
-            text_a = tokenization.convert_to_unicode(line[1])
-            label = tokenization.convert_to_unicode(line[0])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
-class BERTClassifyReader(DataProcessor):
-    """Processor for the Chnsenticorp data set."""
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type, i)
-            text_a = tokenization.convert_to_unicode(line[1])
-            label = tokenization.convert_to_unicode(line[0])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=None, label=label))
-        return examples
 def convert_single_example_to_unicode(guid, single_example):
    text_a = tokenization.convert_to_unicode(single_example[0])
    text_b = tokenization.convert_to_unicode(single_example[1])

--- a/demo/bert-cls/tokenization.py
+++ b/demo/bert-cls/tokenization.py