import copy
import collections
import json
import os
import warnings

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddle.io import Dataset

from paddlenlp.utils.env import DATA_HOME
from paddlenlp.transformers.tokenizer_utils import _is_whitespace, _is_control, convert_to_unicode

__all__ = ['SQuAD', 'DuReaderRobust', 'CMRC', 'DRCD']


class SquadExample(object):
    """A single training/test example for simple sequence classification.

    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible


class SQuAD(Dataset):
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))

    DEV_DATA_URL_V2 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/dev-v2.0.json'
    TRAIN_DATA_URL_V2 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/train-v2.0.json'

    DEV_DATA_URL_V1 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/dev-v1.1.json'
    TRAIN_DATA_URL_V1 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/train-v1.1.json'

    SPLITS = {
        '1.1': {
            'train': META_INFO(
                os.path.join('v1', 'train-v1.1.json'),
                '981b29407e0affa3b1b156f72073b945'),
            'dev': META_INFO(
                os.path.join('v1', 'dev-v1.1.json'),
                '3e85deb501d4e538b6bc56f786231552')
        },
        '2.0': {
            'train': META_INFO(
                os.path.join('v2', 'train-v2.0.json'),
                '62108c273c268d70893182d5cf8df740'),
            'dev': META_INFO(
                os.path.join('v2', 'dev-v2.0.json'),
                '246adae8b7002f8679c027697b0b7cf8')
        }
    }

    def __init__(self,
                 tokenizer,
                 mode='train',
                 version_2_with_negative=False,
                 root=None,
                 doc_stride=128,
                 max_query_length=64,
                 max_seq_length=512,
                 **kwargs):
        self.version_2_with_negative = version_2_with_negative
        self._get_data(root, mode, **kwargs)
        self.tokenizer = tokenizer
        self.doc_stride = doc_stride
        self.max_query_length = max_query_length
        self.max_seq_length = max_seq_length
        self._transform_func = None

        if mode == 'train':
            self.is_training = True
        else:
            self.is_training = False

        self._read()

        self.features = self.convert_examples_to_feature(
            self.examples,
            tokenizer=self.tokenizer,
            doc_stride=self.doc_stride,
            max_query_length=self.max_query_length,
            max_seq_length=self.max_seq_length)

    def _get_data(self, root, mode, **kwargs):
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        if self.version_2_with_negative:
            filename, data_hash = self.SPLITS['2.0'][mode]
        else:
            filename, data_hash = self.SPLITS['1.1'][mode]
        fullname = os.path.join(default_root,
                                filename) if root is None else os.path.join(
                                    os.path.expanduser(root), filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            # The user-specified root does not contain valid data; warn and
            # download to the default location instead.
            if root is not None:
                warnings.warn(
                    'md5 check failed for {}, download {} data to {}'.format(
                        filename, self.__class__.__name__, default_root))
            if mode == 'train':
                if self.version_2_with_negative:
                    fullname = get_path_from_url(
                        self.TRAIN_DATA_URL_V2,
                        os.path.join(default_root, 'v2'))
                else:
                    fullname = get_path_from_url(
                        self.TRAIN_DATA_URL_V1,
                        os.path.join(default_root, 'v1'))
            elif mode == 'dev':
                if self.version_2_with_negative:
                    fullname = get_path_from_url(
                        self.DEV_DATA_URL_V2, os.path.join(default_root, 'v2'))
                else:
                    fullname = get_path_from_url(
                        self.DEV_DATA_URL_V1, os.path.join(default_root, 'v1'))

        self.full_path = fullname

    def convert_examples_to_feature(self, examples, tokenizer, max_seq_length,
                                    doc_stride, max_query_length):
        """Converts a list of `SquadExample`s into a list of `InputFeatures`."""
        unique_id = 1000000000

        features = []
        for (example_index, example) in enumerate(examples):
            query_tokens = tokenizer._tokenize(example.question_text)

            if len(query_tokens) > max_query_length:
                query_tokens = query_tokens[0:max_query_length]

            tok_to_orig_index = []
            orig_to_tok_index = []
            all_doc_tokens = []
            for (i, token) in enumerate(example.doc_tokens):
                orig_to_tok_index.append(len(all_doc_tokens))
                sub_tokens = tokenizer._tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    all_doc_tokens.append(sub_token)

            tok_start_position = None
            tok_end_position = None
            if self.is_training and example.is_impossible:
                tok_start_position = -1
                tok_end_position = -1
            if self.is_training and not example.is_impossible:
                tok_start_position = orig_to_tok_index[example.start_position]
                if example.end_position < len(example.doc_tokens) - 1:
                    tok_end_position = orig_to_tok_index[example.end_position +
                                                         1] - 1
                else:
                    tok_end_position = len(all_doc_tokens) - 1
                (tok_start_position,
                 tok_end_position) = self._improve_answer_span(
                     all_doc_tokens, tok_start_position, tok_end_position,
                     tokenizer, example.orig_answer_text)

            # The -3 accounts for [CLS], [SEP] and [SEP]
            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

            # We can have documents that are longer than the maximum sequence
            # length. To deal with this we do a sliding window approach, where
            # we take chunks of up to our max length with a stride of
            # `doc_stride`.
            _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
                "DocSpan", ["start", "length"])
            doc_spans = []
            start_offset = 0
            while start_offset < len(all_doc_tokens):
                length = len(all_doc_tokens) - start_offset
                if length > max_tokens_for_doc:
                    length = max_tokens_for_doc
                doc_spans.append(_DocSpan(start=start_offset, length=length))
                if start_offset + length == len(all_doc_tokens):
                    break
                start_offset += min(length, doc_stride)

            for (doc_span_index, doc_span) in enumerate(doc_spans):
                tokens = []
                token_to_orig_map = {}
                token_is_max_context = {}
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in query_tokens:
                    tokens.append(token)
                    segment_ids.append(0)
                tokens.append("[SEP]")
                segment_ids.append(0)

                for i in range(doc_span.length):
                    split_token_index = doc_span.start + i
                    token_to_orig_map[len(tokens)] = tok_to_orig_index[
                        split_token_index]

                    is_max_context = self._check_is_max_context(
                        doc_spans, doc_span_index, split_token_index)
                    token_is_max_context[len(tokens)] = is_max_context
                    tokens.append(all_doc_tokens[split_token_index])
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                input_ids = tokenizer.convert_tokens_to_ids(tokens)

                input_mask = [1] * len(input_ids)

                start_position = None
                end_position = None
                if self.is_training and not example.is_impossible:
                    # For training, if our document chunk does not contain an
                    # annotation we throw it out, since there is nothing to
                    # predict.
                    doc_start = doc_span.start
                    doc_end = doc_span.start + doc_span.length - 1
                    out_of_span = False
                    if not (tok_start_position >= doc_start and
                            tok_end_position <= doc_end):
                        out_of_span = True
                    if out_of_span:
                        start_position = 0
                        end_position = 0
                    else:
                        doc_offset = len(query_tokens) + 2
                        start_position = tok_start_position - doc_start + doc_offset
                        end_position = tok_end_position - doc_start + doc_offset

                if self.is_training and example.is_impossible:
                    start_position = 0
                    end_position = 0

                feature = InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=example.is_impossible)

                unique_id += 1
                features.append(feature)

        return features

    def _improve_answer_span(self, doc_tokens, input_start, input_end,
                             tokenizer, orig_answer_text):
        """Returns tokenized answer spans that better match the annotated answer."""
        # The SQuAD annotations are character based. We first project them to
        # whitespace-tokenized words. But then after WordPiece tokenization, we
        # can often find a "better match". For example:
        #
        #   Question: What year was John Smith born?
        #   Context: The leader was John Smith (1895-1943).
        #   Answer: 1895
        #
        # The original whitespace-tokenized answer will be "(1895-1943).".
        # However after tokenization, our tokens will be "( 1895 - 1943 ) .".
        # So we can match the exact answer, 1895.
        #
        # However, this is not always possible. Consider the following:
        #
        #   Question: What country is the top exporter of electronics?
        #   Context: The Japanese electronics industry is the largest in the world.
        #   Answer: Japan
        #
        # In this case, the annotator chose "Japan" as a character sub-span of
        # the word "Japanese". Since our WordPiece tokenizer does not split
        # "Japanese", we just use "Japanese" as the annotation. This is fairly
        # rare in SQuAD, but does happen.
        tok_answer_text = " ".join(tokenizer._tokenize(orig_answer_text))

        for new_start in range(input_start, input_end + 1):
            for new_end in range(input_end, new_start - 1, -1):
                text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
                if text_span == tok_answer_text:
                    return (new_start, new_end)

        return (input_start, input_end)

    def _check_is_max_context(self, doc_spans, cur_span_index, position):
        """Check if this is the 'max context' doc span for the token."""
        # Because of the sliding window approach taken to scoring documents, a
        # single token can appear in multiple documents. E.g.
        #   Doc: the man went to the store and bought a gallon of milk
        #   Span A: the man went to the
        #   Span B: to the store and bought
        #   Span C: and bought a gallon of
        #   ...
        #
        # Now the word 'bought' will have two scores from spans B and C. We
        # only want to consider the score with "maximum context", which we
        # define as the *minimum* of its left and right context (the *sum* of
        # left and right context will always be the same, of course).
        #
        # In the example the maximum context for 'bought' would be span C
        # since it has 1 left context and 3 right context, while span B has
        # 4 left context and 0 right context. (A runnable toy version of this
        # example is sketched in `_demo_max_context` at the end of this file.)
        best_score = None
        best_span_index = None
        for (span_index, doc_span) in enumerate(doc_spans):
            end = doc_span.start + doc_span.length - 1
            if position < doc_span.start:
                continue
            if position > end:
                continue
            num_left_context = position - doc_span.start
            num_right_context = end - position
            score = min(num_left_context,
                        num_right_context) + 0.01 * doc_span.length
            if best_score is None or score > best_score:
                best_score = score
                best_span_index = span_index

        return cur_span_index == best_span_index

    def _read(self):
        with open(self.full_path, "r", encoding="utf8") as reader:
            input_data = json.load(reader)["data"]

        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if _is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    is_impossible = False

                    if self.is_training:
                        if self.version_2_with_negative:
                            is_impossible = qa["is_impossible"]
                        if (len(qa["answers"]) != 1) and (not is_impossible):
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        if not is_impossible:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            try:
                                end_position = char_to_word_offset[
                                    answer_offset + answer_length - 1]
                            except IndexError:
                                # The annotated span runs past the end of the
                                # context; skip this question.
                                continue
                        else:
                            start_position = -1
                            end_position = -1
                            orig_answer_text = ""
                    else:
                        if self.version_2_with_negative:
                            is_impossible = qa["is_impossible"]
                        orig_answer_text = []
                        if not is_impossible and 'answers' in qa.keys():
                            answers = qa["answers"]
                            for answer in answers:
                                orig_answer_text.append(answer["text"])
                        else:
                            start_position = -1
                            end_position = -1

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible)
                    examples.append(example)

        self.examples = examples

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        if self.is_training:
            return feature.input_ids, feature.segment_ids, feature.unique_id, feature.start_position, feature.end_position
        else:
            return feature.input_ids, feature.segment_ids, feature.unique_id


class DuReaderRobust(SQuAD):
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'

    SPLITS = {
        'train': META_INFO(
            os.path.join('dureader_robust-data', 'train.json'),
            '800a3dcb742f9fdf9b11e0a83433d4be'),
        'dev': META_INFO(
            os.path.join('dureader_robust-data', 'dev.json'),
            'ae73cec081eaa28a735204c4898a2222'),
        'test': META_INFO(
            os.path.join('dureader_robust-data', 'test.json'),
            'e0e8aa5c7b6d11b6fc3935e29fc7746f')
    }

    def __init__(self,
                 tokenizer,
                 mode='train',
                 root=None,
                 doc_stride=128,
                 max_query_length=64,
                 max_seq_length=512,
                 **kwargs):
        super(DuReaderRobust, self).__init__(
            tokenizer=tokenizer,
            mode=mode,
            version_2_with_negative=False,
            root=root,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            max_seq_length=max_seq_length,
            **kwargs)

    def _get_data(self, root, mode, **kwargs):
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root,
                                filename) if root is None else os.path.join(
                                    os.path.expanduser(root), filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            # The user-specified root does not contain valid data; warn and
            # download to the default location instead.
            if root is not None:
                warnings.warn(
                    'md5 check failed for {}, download {} data to {}'.format(
                        filename, self.__class__.__name__, default_root))
            get_path_from_url(self.DATA_URL, default_root)

        self.full_path = fullname

    def _read(self):
        with open(self.full_path, "r", encoding="utf8") as reader:
            input_data = json.load(reader)["data"]

        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                raw_doc_tokens = self.tokenizer.basic_tokenizer.tokenize(
                    paragraph_text)
                doc_tokens = []
                char_to_word_offset = []
                k = 0
                temp_word = ""
                for c in paragraph_text:
                    if not self.tokenizer.basic_tokenizer.tokenize(c):
                        char_to_word_offset.append(k - 1)
                        continue
                    else:
                        temp_word += c
                        char_to_word_offset.append(k)
                    if temp_word == raw_doc_tokens[k]:
                        doc_tokens.append(temp_word)
                        temp_word = ""
                        k += 1

                assert k == len(raw_doc_tokens)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
                    orig_answer_text = None
                    is_impossible = False

                    if self.is_training:
                        if (len(qa["answers"]) != 1):
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        try:
                            end_position = char_to_word_offset[
                                answer_offset + answer_length - 1]
                        except IndexError:
                            # The annotated span runs past the end of the
                            # context; skip this question.
                            continue
                    else:
                        orig_answer_text = []
                        if 'answers' in qa.keys():
                            answers = qa["answers"]
                            for answer in answers:
                                orig_answer_text.append(answer["text"])

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible)
                    examples.append(example)

        self.examples = examples


class CMRC(DuReaderRobust):
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_dev.json'
    TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_train.json'
    TRIAL_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_trial.json'

    SPLITS = {
        'train': META_INFO(
            os.path.join('cmrc2018_train.json'),
            '7fb714b479c7f40fbb16acabd7af0ede'),
        'dev': META_INFO(
            os.path.join('cmrc2018_dev.json'),
            '853b80709ff2d071f9fce196521b843c'),
        'trial': META_INFO(
            os.path.join('cmrc2018_trial.json'),
            '853b80709ff2d071f9fce196521b843c')
    }

    def _get_data(self, root, mode, **kwargs):
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root,
                                filename) if root is None else os.path.join(
                                    os.path.expanduser(root), filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            # The user-specified root does not contain valid data; warn and
            # download to the default location instead.
            if root is not None:
                warnings.warn(
                    'md5 check failed for {}, download {} data to {}'.format(
                        filename, self.__class__.__name__, default_root))
            if mode == 'train':
                fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
            elif mode == 'dev':
                fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
            elif mode == 'trial':
                fullname = get_path_from_url(self.TRIAL_DATA_URL, default_root)

        self.full_path = fullname


class DRCD(DuReaderRobust):
    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
    DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_dev.json'
    TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_training.json'
    TEST_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_test.json'

    SPLITS = {
        'train': META_INFO(
            os.path.join('DRCD_training.json'),
            '7fb714b479c7f40fbb16acabd7af0ede'),
        'dev': META_INFO(
            os.path.join('DRCD_dev.json'),
            '853b80709ff2d071f9fce196521b843c'),
        'test': META_INFO(
            os.path.join('DRCD_test.json'),
            '853b80709ff2d071f9fce196521b843c')
    }

    def _get_data(self, root, mode, **kwargs):
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root,
                                filename) if root is None else os.path.join(
                                    os.path.expanduser(root), filename)
        if not os.path.exists(fullname) or (data_hash and
                                            not md5file(fullname) == data_hash):
            # The user-specified root does not contain valid data; warn and
            # download to the default location instead.
            if root is not None:
                warnings.warn(
                    'md5 check failed for {}, download {} data to {}'.format(
                        filename, self.__class__.__name__, default_root))
            if mode == 'train':
                fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
            elif mode == 'dev':
                fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
            elif mode == 'test':
                fullname = get_path_from_url(self.TEST_DATA_URL, default_root)

        self.full_path = fullname
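

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a toy reproduction of
# the "maximum context" example described in SQuAD._check_is_max_context. The
# helper name `_demo_max_context` is invented for this demo. The method does
# not use any instance state, so it is called through the class with `None`
# standing in for `self`, purely for demonstration.
def _demo_max_context():
    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    # Doc: the man went to the store and bought a gallon of milk
    doc_spans = [
        _DocSpan(start=0, length=5),  # Span A: the man went to the
        _DocSpan(start=3, length=5),  # Span B: to the store and bought
        _DocSpan(start=6, length=5),  # Span C: and bought a gallon of
    ]
    position = 7  # index of 'bought' in the whitespace-tokenized document
    for span_index, name in enumerate("ABC"):
        is_max = SQuAD._check_is_max_context(None, doc_spans, span_index,
                                             position)
        print("span {}: is_max_context={}".format(name, is_max))
    # Only span C should print True: 'bought' has 1 token of left context and
    # 3 of right context there, versus 4 left / 0 right in span B.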
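

# Minimal usage sketch (an assumption, not part of the original module): shows
# how the datasets above might be constructed and inspected. It assumes
# `paddlenlp.transformers.BertTokenizer` and the 'bert-base-uncased' weights
# are available; the SQuAD dev file is downloaded automatically on first use.
# Note that `input_ids` are not padded by this dataset, so a real pipeline
# would still need a padding `collate_fn` before batching with
# `paddle.io.DataLoader`.
if __name__ == '__main__':
    from paddlenlp.transformers import BertTokenizer

    _demo_max_context()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dev_ds = SQuAD(
        tokenizer=tokenizer,
        mode='dev',
        version_2_with_negative=False,
        doc_stride=128,
        max_query_length=64,
        max_seq_length=384)

    print('number of features:', len(dev_ds))
    # In 'dev' mode __getitem__ returns (input_ids, segment_ids, unique_id).
    input_ids, segment_ids, unique_id = dev_ds[0]
    print('first feature: unique_id={}, seq_len={}'.format(unique_id,
                                                           len(input_ids)))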