# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import division

import os

from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.plugin.paddle import DALIGenericIterator

import paddle
from paddle import fluid


class HybridTrainPipe(Pipeline):
    """Training pipeline: fused JPEG decode + random resized crop, resize to
    the target crop size, random horizontal flip and normalization, all
    executed on the GPU."""

    def __init__(self,
                 file_root,
                 file_list,
                 batch_size,
                 resize_shorter,
                 crop,
                 min_area,
                 lower,
                 upper,
                 interp,
                 mean,
                 std,
                 device_id,
                 shard_id=0,
                 num_shards=1,
                 random_shuffle=True,
                 num_threads=4,
                 seed=42,
                 pad_output=False,
                 output_dtype=types.FLOAT):
        super(HybridTrainPipe, self).__init__(
            batch_size, num_threads, device_id, seed=seed)
        self.input = ops.FileReader(
            file_root=file_root,
            file_list=file_list,
            shard_id=shard_id,
            num_shards=num_shards,
            random_shuffle=random_shuffle)
        # set internal nvJPEG buffers size to handle full-sized ImageNet images
        # without additional reallocations
        device_memory_padding = 211025920
        host_memory_padding = 140544512
        self.decode = ops.ImageDecoderRandomCrop(
            device='mixed',
            output_type=types.RGB,
            device_memory_padding=device_memory_padding,
            host_memory_padding=host_memory_padding,
            random_aspect_ratio=[lower, upper],
            random_area=[min_area, 1.0],
            num_attempts=100)
        self.res = ops.Resize(
            device='gpu', resize_x=crop, resize_y=crop, interp_type=interp)
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu",
            output_dtype=output_dtype,
            output_layout=types.NCHW,
            crop=(crop, crop),
            image_type=types.RGB,
            mean=mean,
            std=std,
            pad_output=pad_output)
        self.coin = ops.CoinFlip(probability=0.5)
        self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")

    def define_graph(self):
        rng = self.coin()
        jpegs, labels = self.input(name="Reader")
        images = self.decode(jpegs)
        images = self.res(images)
        output = self.cmnp(images.gpu(), mirror=rng)
        return [output, self.to_int64(labels.gpu())]

    def __len__(self):
        return self.epoch_size("Reader")


class HybridValPipe(Pipeline):
    """Validation pipeline: JPEG decode, resize of the shorter side, center
    crop and normalization on the GPU, without random augmentation."""

    def __init__(self,
                 file_root,
                 file_list,
                 batch_size,
                 resize_shorter,
                 crop,
                 interp,
                 mean,
                 std,
                 device_id,
                 shard_id=0,
                 num_shards=1,
                 random_shuffle=False,
                 num_threads=4,
                 seed=42,
                 pad_output=False,
                 output_dtype=types.FLOAT):
        super(HybridValPipe, self).__init__(
            batch_size, num_threads, device_id, seed=seed)
        self.input = ops.FileReader(
            file_root=file_root,
            file_list=file_list,
            shard_id=shard_id,
            num_shards=num_shards,
            random_shuffle=random_shuffle)
        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
        self.res = ops.Resize(
            device="gpu", resize_shorter=resize_shorter, interp_type=interp)
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu",
            output_dtype=output_dtype,
            output_layout=types.NCHW,
            crop=(crop, crop),
            image_type=types.RGB,
            mean=mean,
            std=std,
            pad_output=pad_output)
        self.to_int64 = ops.Cast(dtype=types.INT64, device="gpu")

    def define_graph(self):
        jpegs, labels = self.input(name="Reader")
        images = self.decode(jpegs)
        images = self.res(images)
        output = self.cmnp(images)
        return [output, self.to_int64(labels.gpu())]

    def __len__(self):
        return self.epoch_size("Reader")


def build(settings, mode='train'):
    env = os.environ
    assert settings.use_gpu, "gpu training is required for DALI"
    assert not settings.use_mixup, "mixup is not supported by DALI reader"
    assert not settings.use_aa, "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    file_root = settings.data_dir
    bs = settings.batch_size
    assert bs % paddle.fluid.core.get_cuda_device_count() == 0, \
        "batch size must be multiple of number of devices"
    # the global batch size is split evenly across the visible devices
    batch_size = bs // paddle.fluid.core.get_cuda_device_count()

    # DALI normalizes on the 0-255 pixel range, so scale the configured
    # mean/std (given for images in [0, 1]) accordingly
    mean = [v * 255 for v in settings.image_mean]
    std = [v * 255 for v in settings.image_std]
    crop = settings.image_shape[1]
    resize_shorter = settings.resize_short_size
    min_area = settings.lower_scale
    lower = settings.lower_ratio
    upper = settings.upper_ratio
    output_dtype = (types.FLOAT16
                    if settings.use_amp and settings.use_pure_fp16
                    else types.FLOAT)

    interp = settings.interpolation or 1  # default to linear
    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]
    # pad the channel dimension when a 4-channel layout is requested
    pad_output = False
    if settings.image_shape[0] == 4:
        pad_output = True

    if mode != 'train':
        p = fluid.framework.cuda_places()[0]
        place = fluid.core.Place()
        place.set_place(p)
        device_id = place.gpu_device_id()
        file_list = os.path.join(file_root, 'val_list.txt')
        if not os.path.exists(file_list):
            file_list = None
            file_root = os.path.join(file_root, 'val')
        pipe = HybridValPipe(
            file_root,
            file_list,
            batch_size,
            resize_shorter,
            crop,
            interp,
            mean,
            std,
            device_id=device_id,
            pad_output=pad_output,
            output_dtype=output_dtype)
        pipe.build()
        return DALIGenericIterator(
            pipe, ['feed_image', 'feed_label'],
            size=len(pipe),
            dynamic_shape=True,
            fill_last_batch=True,
            last_batch_padded=True)

    file_list = os.path.join(file_root, 'train_list.txt')
    if not os.path.exists(file_list):
        file_list = None
        file_root = os.path.join(file_root, 'train')

    if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
        # multi-process mode (one trainer per GPU): each process builds a
        # single pipeline reading its own shard
        shard_id = int(env['PADDLE_TRAINER_ID'])
        num_shards = int(env['PADDLE_TRAINERS_NUM'])
        device_id = int(env['FLAGS_selected_gpus'])
        pipe = HybridTrainPipe(
            file_root,
            file_list,
            batch_size,
            resize_shorter,
            crop,
            min_area,
            lower,
            upper,
            interp,
            mean,
            std,
            device_id,
            shard_id,
            num_shards,
            seed=42 + shard_id,
            pad_output=pad_output,
            output_dtype=output_dtype)
        pipe.build()
        pipelines = [pipe]
        sample_per_shard = len(pipe) // num_shards
    else:
        # single-process multi-GPU mode: build one pipeline (shard) per device
        pipelines = []
        places = fluid.framework.cuda_places()
        num_shards = len(places)
        for idx, p in enumerate(places):
            place = fluid.core.Place()
            place.set_place(p)
            device_id = place.gpu_device_id()
            pipe = HybridTrainPipe(
                file_root,
                file_list,
                batch_size,
                resize_shorter,
                crop,
                min_area,
                lower,
                upper,
                interp,
                mean,
                std,
                device_id,
                idx,
                num_shards,
                seed=42 + idx,
                pad_output=pad_output,
                output_dtype=output_dtype)
            pipe.build()
            pipelines.append(pipe)
        sample_per_shard = len(pipelines[0])

    return DALIGenericIterator(
        pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)


def train(settings):
    return build(settings, 'train')


def val(settings):
    return build(settings, 'val')
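

# The block below is a minimal usage sketch, not part of the original reader.
# It assumes an ImageNet-style dataset under ./data/ILSVRC2012 (with
# train_list.txt / val_list.txt, or train/ and val/ subfolders) and that
# `export FLAGS_fraction_of_gpu_memory_to_use=0.8` (or similar) has been set
# so the GPU-memory assertion in `build` passes. The Namespace only mimics the
# settings attributes that `build` reads; the real training scripts pass their
# parsed command-line arguments instead.
if __name__ == '__main__':
    from argparse import Namespace

    demo_settings = Namespace(
        use_gpu=True,
        use_mixup=False,
        use_aa=False,
        use_amp=False,
        use_pure_fp16=False,
        data_dir='./data/ILSVRC2012',   # assumed dataset location
        batch_size=256,                 # global batch size, split across GPUs
        image_mean=[0.485, 0.456, 0.406],
        image_std=[0.229, 0.224, 0.225],
        image_shape=[3, 224, 224],
        resize_short_size=256,
        lower_scale=0.08,
        lower_ratio=3. / 4.,
        upper_ratio=4. / 3.,
        interpolation=None)             # falls back to linear interpolation

    val_iter = val(demo_settings)
    for batch in val_iter:
        # each iteration carries one dict per pipeline, keyed by the names
        # passed to DALIGenericIterator ('feed_image', 'feed_label')
        print(sorted(batch[0].keys()))
        break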