# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import contextlib
from functools import partial

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.io import DataLoader
from paddle.fluid.dygraph_grad_clip import GradClipByGlobalNorm

import reader
from args import parse_args
from seq2seq_base import BaseModel, CrossEntropyCriterion
from seq2seq_attn import AttentionModel
from model import Input, set_device
from callbacks import ProgBarLogger
from metrics import Metric


class PPL(Metric):
    # Placeholder for a perplexity metric; not implemented yet.
    pass


def do_train(args):
    device = set_device("gpu" if args.use_gpu else "cpu")
    # Always run in dygraph (eager) mode for now; the static-graph toggle
    # (args.eager_run) is disabled.
    fluid.enable_dygraph(device)

    # Define the model inputs: padded word-id matrices plus the actual
    # (unpadded) length of each sequence in the batch.
    inputs = [
        Input([None, None], "int64", name="src_word"),
        Input([None], "int64", name="src_length"),
        Input([None, None], "int64", name="trg_word"),
        Input([None], "int64", name="trg_length"),
    ]
    labels = [Input([None, None, 1], "int64", name="label")]

    model = AttentionModel(args.src_vocab_size, args.tar_vocab_size,
                           args.hidden_size, args.hidden_size,
                           args.num_layers, args.dropout)

    model.prepare(
        fluid.optimizer.Adam(
            learning_rate=args.learning_rate,
            parameter_list=model.parameters()),
        CrossEntropyCriterion(),
        inputs=inputs,
        labels=labels)

    # Synthetic-data smoke test: fixed-shape random batches, just enough to
    # exercise a short training run.
    batch_size = 32
    src_seq_len = 10
    trg_seq_len = 12
    iter_num = 10

    def random_generator():
        for i in range(iter_num):
            # Word ids start at 2 so that 0/1 stay free for special tokens.
            src = np.random.randint(
                2, args.src_vocab_size,
                (batch_size, src_seq_len)).astype("int64")
            src_length = np.random.randint(
                1, src_seq_len, (batch_size, )).astype("int64")
            trg = np.random.randint(
                2, args.tar_vocab_size,
                (batch_size, trg_seq_len)).astype("int64")
            trg_length = np.random.randint(
                1, trg_seq_len, (batch_size, )).astype("int64")
            # Labels are target-vocabulary word ids, one per target position.
            # (The original bounded them by trg_seq_len, which looks like a
            # typo: labels feed CrossEntropyCriterion, so they should range
            # over the target vocabulary like trg does.)
            label = np.random.randint(
                2, args.tar_vocab_size,
                (batch_size, trg_seq_len, 1)).astype("int64")
            yield src, src_length, trg, trg_length, label

    # Fit on the synthetic data and return early; the file-based pipeline
    # below is not wired up yet (no loaders are ever built from data_files),
    # so it is intentionally unreachable.
    model.fit(train_data=random_generator, log_freq=1)
    return

    data_loaders = [None, None]
    data_files = [args.training_file, args.validation_file
                  ] if args.validation_file else [args.training_file]
    # TODO: build the train/eval DataLoaders from data_files (e.g. via the
    # reader module) before removing the early return above.
    train_loader, eval_loader = data_loaders

    model.fit(train_data=train_loader,
              eval_data=None,
              epochs=1,
              eval_freq=1,
              save_freq=1,
              verbose=2)


if __name__ == "__main__":
    args = parse_args()
    do_train(args)