# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import six
import sys
import time
import signal
import logging
import itertools
import threading
import numpy as np
import multiprocessing
from collections import namedtuple

# NOTE: queue has a different name in python2 and python3
if six.PY2:
    import Queue as queue
else:
    import queue

import paddle
from .. import core, layers
from ..framework import in_dygraph_mode
from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler
from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher

__all__ = ['get_worker_info']

# multi-process worker check indices queue interval, avoid
# hanging in subprocess data loading
MP_INDICES_CHECK_INTERVAL = 5

_IterableDatasetStopIteration = namedtuple('_IterableDatasetStopIteration',
                                           ['worker_id'])


def default_collate_fn(batch):
    """
    Default batch collating function for :code:`fluid.io.DataLoader`,
    batch should be a list of samples, and each sample should be a list
    of fields as follows:
    
    [[filed1, filed2, ...], [filed1, filed2, ...], ...]
    
    This default collate function zipped each filed together and stack
    each filed as the batch field as follows:

    [batch_filed1, batch_filed2, ...]

    Args:  
        batch(list of list of numpy array): the batch data, each fields
              should be a numpy array, each sample should be a list of
              fileds, and batch should be a list of sample.
    
    Returns:
        a list of numpy array: collated batch
    """
    sample = batch[0]
    # dataset has only 1 field
    if isinstance(sample, np.ndarray):
        return [np.stack(batch, axis=0)]

    # batch each field
    slots = []
    for items in batch:
        for i, item in enumerate(items):
            if len(slots) < len(items):
                slots.append([item])
            else:
                slots[i].append(item)

    if isinstance(slots[0][0], np.ndarray):
        return [np.stack(slot, axis=0) for slot in slots]
    elif isinstance(slots[0][0], paddle.Tensor):
        return [layers.stack(slot, axis=0) for slot in slots]
    else:
        raise RuntimeError("Unknown data type {}".format(type(slots[0][0])))


class _DatasetKind(object):
    MAP = 0
    ITER = 1

    @staticmethod
    def create_fetcher(kind, dataset, collate_fn, drop_last):
        if kind == _DatasetKind.MAP:
            return _MapDatasetFetcher(dataset, collate_fn, drop_last)
        elif kind == _DatasetKind.ITER:
            return _IterableDatasetFetcher(dataset, collate_fn, drop_last)
        else:
            raise NotImplementedError("unknown Dataset kind {}".format(kind))


class ParentWatchDog(object):
    def __init__(self):
        self._parent_pid = os.getppid()
        self._parent_alive = True

    def is_alive(self):
        if self._parent_alive:
            self._parent_alive = os.getppid() == self._parent_pid
        return self._parent_alive


# worker information for each workers, used for splitting data copy
# for IteratorDataset in worker processes.
_worker_info = None


def get_worker_info():
    """
    Get DataLoader worker process information function, this function is
    used to split data copy in worker process for IterableDataset
    (see :code:`paddle.io.IterableDataset`), worker information contains
    following fields:

    :attr:`num_workers`: total worker process number, see `paddle.io.DataLoader`

    :attr:`id`: the worker processs id, count from 0 to :attr:`num_workers - 1`

    :attr:`dataset`: the dataset object in this worker process

    Returns:
        WorkerInfo: an instance of WorkerInfo which contains fields above.

    .. note::
        For mode usage and exampls, please see :code:`paddle.io.IterableDataset`

    Example:

        .. code-block:: python

            import math
            import numpy as np
            import paddle.fluid as fluid
            from paddle.io import IterableDataset, DataLoader, get_worker_info

            class SplitedIterableDataset(IterableDataset):
                def __init__(self, start, end):
                    self.start = start
                    self.end = end

                def __iter__(self):
                    worker_info = get_worker_info()
                    if worker_info is None:
                        iter_start = self.start
                        iter_end = self.end
                    else:
                        per_worker = int(
                            math.ceil((self.end - self.start) / float(
                                worker_info.num_workers)))
                        worker_id = worker_info.id
                        iter_start = self.start + worker_id * per_worker
                        iter_end = min(iter_start + per_worker, self.end)

                    for i in range(iter_start, iter_end):
                        yield np.array([i])

            place = fluid.CPUPlace()
            with fluid.dygraph.guard(place):
                dataset = SplitedIterableDataset(start=2, end=9)
                dataloader = DataLoader(
                    dataset,
                    places=place,
                    num_workers=2,
                    batch_size=1,
                    drop_last=True)

                print(list(dataloader))
                # outputs: [2, 5, 3, 6, 4, 7]

    """
    return _worker_info


class WorkerInfo(object):
    __initialized = False

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.__initialized = True

    def __setattr__(self, key, val):
        if self.__initialized:
            raise RuntimeError("Cannot assign attributes to {} objects".format(
                self.__class__.__name__))
        return super(WorkerInfo, self).__setattr__(key, val)


class _DataLoaderIterBase(object):
    """
    Iterator implement of DataLoader, will load and feed mini-batch
    data by setting in given dataloader.

    Args:
        loader(instance of DataLoader): instance of `fluid.io.DataLoader`
    """

    def __init__(self, loader):
        self._dataset = loader.dataset
        self._feed_list = loader.feed_list or []
        self._places = loader.places
        self._return_list = loader.return_list
        self._batch_sampler = loader.batch_sampler
        self._sampler_iter = iter(loader.batch_sampler)
        self._collate_fn = loader.collate_fn or default_collate_fn
        self._num_workers = loader.num_workers
        self._use_buffer_reader = loader.use_buffer_reader
        self._use_shared_memory = loader.use_shared_memory
        self._timeout = loader.timeout if loader.timeout > 0 else MP_INDICES_CHECK_INTERVAL
        self._worker_init_fn = loader.worker_init_fn
        self._dataset_kind = loader.dataset_kind
        self._pin_memory = loader.pin_memory

        # LoDTensorBlockingQueue instance for create_py_reader and a thread
        # to put mini-batch data to self._blocking_queue, mini-batch data
        # will be get from:
        # 1. multi-process mode: get data from workers' result queue
        # 2. single-process mode: read mini-batch data in main process
        self._blocking_queue = None
        self._thread = None
        self._thread_done_event = threading.Event()

    def __iter__(self):
        return self

    def __len__(self):
        return len(self._batch_sampler)


class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
    """
    Single process implement of DataLoaderIter, loading data from
    loader.data in main process
    """

    def __init__(self, loader):
        super(_DataLoaderIterSingleProcess, self).__init__(loader)

        self._dataset_fetcher = _DatasetKind.create_fetcher(
            self._dataset_kind, self._dataset, self._collate_fn, True)

        # NOTE: len(self._places) batch data compose as an output
        # iteration, set blocking_queue can cache 2 iteration datas
        # at most here
        self._blocking_queue_capacity = 2 * len(self._places)

        self._init_thread()

    def _init_thread(self):
        self._var_names = [v.name for v in self._feed_list]
        self._shapes = [v.shape for v in self._feed_list]
        self._dtypes = [v.dtype for v in self._feed_list]
        self._need_check_feed = [
            v.desc.need_check_feed() for v in self._feed_list
        ]
        # if only 1 place, do not need to keep order
        self._blocking_queue = core.init_lod_tensor_blocking_queue(
            core.Variable(), self._blocking_queue_capacity,
            len(self._places) > 1)
        self._reader = core.create_py_reader(
            self._blocking_queue, self._var_names, self._shapes, self._dtypes,
            self._need_check_feed, self._places, self._use_buffer_reader, True,
            self._pin_memory)

        self._thread = threading.Thread(target=self._thread_loop)
        self._thread.daemon = True
        self._thread.start()

    def _thread_loop(self):
        try:
            for indices in self._sampler_iter:
                # read data from dataset in mini-batch
                batch = self._dataset_fetcher.fetch(indices)

                # pack as LoDTensorArray
                array = core.LoDTensorArray()
                for slot in batch:
                    if not isinstance(slot, core.LoDTensor):
                        self._check_input_array(slot)
                        # FIXME(dkp): blocking_queue only support
                        #             core.LoDTensorArray as input now, read
                        #             numpy data into a LoDTensorArray here,
                        #             should support paddle.Tensor list later
                        if isinstance(slot, paddle.Tensor):
                            slot = slot.numpy()
                        tmp = core.LoDTensor()
                        tmp.set(slot, core.CPUPlace())
                        slot = tmp

                    array.append(slot)

                if not self._blocking_queue.push(array):
                    break

            self._blocking_queue.close()
            self._thread = None
        except StopIteration:
            self._blocking_queue.close()
        except Exception:
            self._blocking_queue.kill()
            self._thread = None
            logging.warning("DataLoader reader thread raised an exception.")
            six.reraise(*sys.exc_info())

    @classmethod
    def _check_input_array(cls, item):
        if isinstance(item, paddle.Tensor):
            return
        arr = np.array(item)
        if arr.dtype == np.object:
            raise TypeError((
                "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
                "this means the input data contains nested lists with different lengths. "
                "\n\t* Check the reader function passed to 'decorate_batch_generator'"
                " to locate the data causes this issue.\n\t* Please consider using "
                "'fluid.create_lod_tensor' to convert it to a LoD-Tensor."))

    def __next__(self):
        try:
            if in_dygraph_mode():
                return self._reader.read_next_var_list()
            else:
                if self._return_list:
                    return self._reader.read_next_list()
                else:
                    return self._reader.read_next()
        except StopIteration:
            self._reader.reset()
            six.reraise(*sys.exc_info())

    # python2 compatibility
    def next(self):
        return self.__next__()


class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
    def __init__(self, loader):
        super(_DataLoaderIterMultiProcess, self).__init__(loader)

        assert self._num_workers > 0,  "Multi-process DataLoader " \
                    "invalid num_workers({})".format(self._num_workers)

        # subprocess wrokers' result queue
        self._data_queue = None

        # data get from _data_queue will be reordered by _rcvd_idx
        # for data order keeping, data index not equal _rcvd_idx 
        # will be cached in _task_infos
        self._send_idx = 0
        self._rcvd_idx = 0
        self._batches_outstanding = 0
        self._task_infos = {}

        # indices outstand as _outstanding_capacity at first, and
        # blocking_queue capacity is also _outstanding_capacity.
        # _outstanding_capacity here to make sure each indices_queue
        # has at least 2 indices, and outstanding batch cached
        # output data for at least 2 iterations(Note that len(_places)
        # batches will be composed as an iteration output)
        self._outstanding_capacity = 2 * max(self._num_workers,
                                             len(self._places))

        # see _try_put_indices
        self._thread_lock = threading.Lock()

        # init workers and indices queues and put 2 indices in each indices queue
        self._init_workers()
        for _ in range(self._outstanding_capacity):
            self._try_put_indices()

        self._init_thread()
        self._shutdown = False

    def _init_workers(self):
        # multiprocess worker and indice queue list initial as empty
        self._workers = []
        self._worker_status = []
        self._indices_queues = []
        self._workers_idx_cycle = itertools.cycle(range(self._num_workers))

        # create data_queue for workers
        self._data_queue = multiprocessing.Queue()

        # event for workers and thread, thread event is only need 
        # in multi-processing mode
        self._workers_done_event = multiprocessing.Event()
        self._thread_done_event = threading.Event()

        for i in range(self._num_workers):
            indices_queue = multiprocessing.Queue()
            self._indices_queues.append(indices_queue)
            worker = multiprocessing.Process(
                target=self._worker_loop,
                args=(self._dataset, self._dataset_kind, indices_queue,
                      self._data_queue, self._workers_done_event,
                      self._collate_fn, self._worker_init_fn, i,
                      self._num_workers))
            worker.daemon = True
            worker.start()
            self._workers.append(worker)
            self._worker_status.append(True)

        core._set_process_pids(id(self), tuple(w.pid for w in self._workers))
        _set_SIGCHLD_handler()

    def _clear_and_remove_data_queue(self):
        if self._data_queue is not None:
            while True:
                try:
                    self._data_queue.get_nowait()
                except:
                    self._data_queue.cancel_join_thread()
                    self._data_queue.close()
                    break

    def _init_thread(self):
        self._var_names = [v.name for v in self._feed_list]
        self._shapes = [v.shape for v in self._feed_list]
        self._dtypes = [v.dtype for v in self._feed_list]
        self._need_check_feed = [
            v.desc.need_check_feed() for v in self._feed_list
        ]
        # if only 1 place, do not need to keep order
        self._blocking_queue = core.init_lod_tensor_blocking_queue(
            core.Variable(), self._outstanding_capacity, len(self._places) > 1)
        self._reader = core.create_py_reader(
            self._blocking_queue, self._var_names, self._shapes, self._dtypes,
            self._need_check_feed, self._places, self._use_buffer_reader, True,
            self._pin_memory)

        self._thread_done_event = threading.Event()
        self._thread = threading.Thread(target=self._thread_loop)
        self._thread.daemon = True
        self._thread.start()

    def _shutdown_worker(self, worker_id):
        if self._worker_status[worker_id]:
            self._indices_queues[worker_id].put(None)
            self._worker_status[worker_id] = False

    def _try_shutdown_all(self):
        if not self._shutdown:
            try:
                self._exit_thread_expectedly()
                self._clear_and_remove_data_queue()

                # set _workers_done_event should be set before put None
                # to indices_queue, workers wll exit on reading None from
                # indices_queue
                self._workers_done_event.set()
                for i in range(self._num_workers):
                    self._shutdown_worker(i)

                for w in self._workers:
                    w.join()
                for q in self._indices_queues:
                    q.cancel_join_thread()
                    q.close()
            finally:
                core._erase_process_pids(id(self))
                self._shutdown = True

    def _exit_thread_expectedly(self):
        self._thread_done_event.set()
        self._blocking_queue.close()

    def _exit_thread_unexpectedly(self):
        self._thread_done_event.set()
        self._blocking_queue.kill()
        logging.error("DataLoader reader thread raised an exception!")

    def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue,
                     done_event, collate_fn, init_fn, worker_id, num_workers):
        try:
            # NOTE: [ mmap files clear ] When the child process exits unexpectedly,
            # some shared memory objects may have been applied for but have not yet
            # been put into the inter-process Queue. This part of the object needs
            # to be cleaned up when the process ends.
            CleanupFuncRegistrar.register(_cleanup_mmap)

            # set signal handler
            core._set_process_signal_handler()

            global _worker_info
            _worker_info = WorkerInfo(
                id=worker_id, num_workers=num_workers, dataset=dataset)

            init_exception = None
            try:
                if init_fn is not None:
                    init_fn(worker_id)
                fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
                                                      collate_fn, True)
            except:
                init_exception = Exception("init_fn failed in worker {}: " \
                                     "{}".format(worker_id, sys.exc_info()))

            iterator_drained = False
            parent_watch_dog = ParentWatchDog()

            while parent_watch_dog.is_alive():
                try:
                    data = indices_queue.get(MP_INDICES_CHECK_INTERVAL)
                except queue.Empty:
                    continue

                # None as poison piil, so worker event should be set
                if data is None:
                    assert done_event.is_set() or iterator_drained, \
                            "get None when worker done_event set"
                    break
                # If worker done event is set but get still get data in
                # indices_queue, remaining data should be get and skipped.
                if done_event.is_set() or iterator_drained:
                    continue

                idx, indices = data
                try:
                    if init_exception is not None:
                        batch = init_exception
                        init_exception = None
                    else:
                        batch = fetcher.fetch(indices)
                except Exception as e:
                    if isinstance(
                            e,
                            StopIteration) and dataset_kind == _DatasetKind.ITER:
                        out_queue.put(_IterableDatasetStopIteration(worker_id))
                        iterator_drained = True
                    else:
                        out_queue.put((idx, e))
                else:
                    if self._use_shared_memory:
                        # FIXME(dkp): _convert_to_tensor_list only support np.array
                        #             list now, should support paddle.Tensor list
                        if isinstance(batch[0][0], paddle.Tensor):
                            np_batch = []
                            for sample in batch:
                                np_batch.append([s.numpy() for s in sample])
                            batch = np_batch

                        tensor_list = core._convert_to_tensor_list(batch)
                        out_queue.put((idx, tensor_list))
                        core._remove_tensor_list_mmap_fds(tensor_list)
                    else:
                        out_queue.put((idx, batch))
        except KeyboardInterrupt:
            # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process
            pass
        except:
            six.reraise(*sys.exc_info())
        finally:
            if self._use_shared_memory:
                _cleanup_mmap()

    def _thread_loop(self):
        while not self._thread_done_event.is_set():
            batch = self._get_data()
            if not self._thread_done_event.is_set():
                if batch is None:
                    self._exit_thread_expectedly()
                elif isinstance(batch, Exception):
                    self._exit_thread_unexpectedly()
                else:
                    try:
                        # pack as LoDTensorArray
                        array = core.LoDTensorArray()
                        if self._use_shared_memory:
                            for tensor in batch:
                                array.append(tensor)
                        else:
                            # LoDTensor not in shared memory is not
                            # serializable, cannot be create in workers
                            for slot in batch:
                                if not isinstance(slot, core.LoDTensor):
                                    tmp = core.LoDTensor()
                                    tmp.set(slot, core.CPUPlace())
                                    slot = tmp
                                array.append(slot)

                        if not self._blocking_queue.push(array):
                            self._blocking_queue.close()
                    except:
                        self._exit_thread_unexpectedly()
                        six.reraise(*sys.exc_info())
                    finally:
                        self._rcvd_idx += 1

    def _get_data(self):
        while not self._thread_done_event.is_set():
            # For IterableDataset, batch indices is generated infinitely
            # for each worker to raise StopIteration, but a StopIteration
            # raising process will discard a batch indices which is count
            # in _send_idx but will not increase _rcvd_idx, so we check 
            # whether the worker is still alive here to skip the discarded
            # batch indices and increase _rcvd_idx
            while self._rcvd_idx < self._send_idx:
                info = self._task_infos[self._rcvd_idx]
                if len(info) == 2 or self._worker_status[info[0]]:
                    break
                del self._task_infos[self._rcvd_idx]
                self._rcvd_idx += 1
                self._batches_outstanding -= 1
            else:
                # NOTE: _rcvd_idx and _send_idx only record batches among
                #       workers, if batches among workers drained, there
                #       may also be data in blocking queue
                if self._batches_outstanding < len(self._places):
                    return None
                continue

            if len(self._task_infos[self._rcvd_idx]) == 2:
                return self._task_infos.pop(self._rcvd_idx)[1]

            try:
                # [ avoid hang ]: main process may blocking at _reader.read_next when
                # KeyboardInterrupt, we do following tradeoff:
                # 1. get data with timeout, MP_INDICES_CHECK_INTERVAL(5s) as timeout
                #    default, if KeyboardInterrupt blocking, failed workers will be
                #    checked and raise RuntimeError to quit DataLoader in timeout
                #    exception handling.
                # 2. if get data timeout and check workers all alive, continue to
                #    get data again
                data = self._data_queue.get(timeout=self._timeout)
            except Exception as e:
                # check if thread done event set when waiting data
                if self._thread_done_event.is_set():
                    continue

                # check failed workers
                failed_workers = []
                for i, w in enumerate(self._workers):
                    if self._worker_status[i] and not w.is_alive():
                        failed_workers.append(w)
                        self._shutdown_worker(i)
                if len(failed_workers) > 0:
                    self._exit_thread_unexpectedly()
                    pids = ', '.join(str(w.pid) for w in failed_workers)
                    raise RuntimeError("DataLoader {} workers exit unexpectedly, " \
                                "pids: {}".format(len(failed_workers), pids))

                # get(timeout) will call _poll(timeout) and may raise IOError
                if isinstance(e, queue.Empty) or isinstance(e, IOError):
                    # continue on timeout to keep getting data from queue
                    continue

                self._exit_thread_unexpectedly()
                logging.error("DataLoader reader thread failed({}) to read data from " \
                              "workers' result queue.".format(e))
                six.reraise(*sys.exc_info())
            else:
                if self._dataset_kind == _DatasetKind.ITER and isinstance(
                        data, _IterableDatasetStopIteration):
                    # if a worker get StopIteraion, we shutdown this worker,
                    # note that this batch indices to trigger StopIteration
                    # is discard, outstanding batch number should be decrease
                    # and another indices should be put for other workers
                    # may still working.
                    self._shutdown_worker(data.worker_id)
                    self._batches_outstanding -= 1
                    self._try_put_indices()
                    continue

                idx, batch = data
                if idx == self._rcvd_idx:
                    del self._task_infos[idx]
                    return batch
                else:
                    self._task_infos[idx] += (batch, )
                    continue

    def _try_put_indices(self):
        assert self._batches_outstanding <= self._outstanding_capacity, \
                    "too many indices have been put to queue"
        # In multi-process mode for IterableDataset, _try_put_indices will
        # be called both in main process(for our implement has blocking queue,
        # and blocking queue read is in main process) and thread, which may
        # cause error following error
        #   1. "ValueError: generator already executing" in next(self._sampler_iter)
        #   2. re-enter in increase _send_idx
        # add a lock for threading save, for _try_put_indices is only a slight
        # function which is not in data reading pipeline, this lock almost no
        # influence on performance
        with self._thread_lock:
            try:
                indices = next(self._sampler_iter)
            except StopIteration:
                return

            for i in range(self._num_workers):
                worker_idx = next(self._workers_idx_cycle)
                if self._worker_status[worker_idx]:
                    break
            else:
                return

            self._indices_queues[worker_idx].put((self._send_idx, indices))
            self._task_infos[self._send_idx] = (worker_idx, )
            self._batches_outstanding += 1
            self._send_idx += 1

    def __del__(self):
        self._try_shutdown_all()

    def __next__(self):
        try:
            # _batches_outstanding here record the total batch data number
            # in 'from after _try_put_indices to beforeoutput data', this
            # value should be _outstanding_capacity if data is not drained,
            # if _batches_outstanding is less than _places number, there are
            # no enough data to generate next output, close blocking_queue and
            # set _thread_done_event here, py_reader will raise StopIteration,
            # end workers and indices_queues in StopIteration handling
            if self._batches_outstanding < len(self._places):
                self._thread_done_event.set()
                self._blocking_queue.close()

            if in_dygraph_mode():
                data = self._reader.read_next_var_list()
            else:
                if self._return_list:
                    data = self._reader.read_next_list()
                    # static graph organized data on multi-device with list, if
                    # place number is 1, there is only 1 device, extra the data
                    # from list for devices to be compatible with dygraph mode
                    if len(self._places) == 1:
                        data = data[0]
                else:
                    data = self._reader.read_next()
            self._on_output_batch()
            return data
        except StopIteration:
            self._reader.reset()
            self._try_shutdown_all()
            six.reraise(*sys.exc_info())

    # python2 compatibility
    def next(self):
        return self.__next__()

    def _on_output_batch(self):
        for _ in range(len(self._places)):
            self._batches_outstanding -= 1
            self._try_put_indices()