# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data preprocessing for DBLP dataset"""
import sys
import os
import argparse
import numpy as np
from collections import OrderedDict

AUTHOR = 14475
PAPER = 14376
CONF = 20
TYPE = 8920
LABEL = 4


def build_node_types(meta_node, outfile):
    """build_node_types"""
    nt_ori2new = {}
    with open(outfile, 'w') as writer:
        offset = 0
        for node_type, num_nodes in meta_node.items():
            ori_id2new_id = {}
            for i in range(num_nodes):
                writer.write("%d\t%s\n" % (offset + i, node_type))
                ori_id2new_id[i + 1] = offset + i
            nt_ori2new[node_type] = ori_id2new_id
            offset += num_nodes
    return nt_ori2new


def remapping_index(args, src_dict, dst_dict, ori_file, new_file):
    """remapping_index"""
    ori_file = os.path.join(args.data_path, ori_file)
    new_file = os.path.join(args.output_path, new_file)
    with open(ori_file, 'r') as reader, open(new_file, 'w') as writer:
        for line in reader:
            slots = line.strip().split()
            s = int(slots[0])
            d = int(slots[1])
            new_s = src_dict[s]
            new_d = dst_dict[d]
            writer.write("%d\t%d\n" % (new_s, new_d))


def author_label(args, ori_id2pgl_id, ori_file, real_file, new_file):
    """author_label"""
    ori_file = os.path.join(args.data_path, ori_file)
    real_file = os.path.join(args.data_path, real_file)
    new_file = os.path.join(args.output_path, new_file)
    real_id2pgl_id = {}
    with open(ori_file, 'r') as reader:
        for line in reader:
            slots = line.strip().split()
            ori_id = int(slots[0])
            real_id = int(slots[1])
            pgl_id = ori_id2pgl_id[ori_id]
            real_id2pgl_id[real_id] = pgl_id

    with open(real_file, 'r') as reader, open(new_file, 'w') as writer:
        for line in reader:
            slots = line.strip().split()
            real_id = int(slots[0])
            label = int(slots[1])
            pgl_id = real_id2pgl_id[real_id]
            writer.write("%d\t%d\n" % (pgl_id, label))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='DBLP data preprocessing')
    parser.add_argument(
        '--data_path',
        default=None,
        type=str,
        help='original data path(default: None)')
    parser.add_argument(
        '--output_path',
        default=None,
        type=str,
        help='output path(default: None)')
    args = parser.parse_args()

    meta_node = OrderedDict()
    meta_node['a'] = AUTHOR
    meta_node['p'] = PAPER
    meta_node['c'] = CONF
    meta_node['t'] = TYPE

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    node_types_file = os.path.join(args.output_path, "node_types.txt")
    nt_ori2new = build_node_types(meta_node, node_types_file)

    remapping_index(args, nt_ori2new['p'], nt_ori2new['a'], 'paper_author.dat',
                    'paper_author.txt')
    remapping_index(args, nt_ori2new['p'], nt_ori2new['c'],
                    'paper_conference.dat', 'paper_conference.txt')
    remapping_index(args, nt_ori2new['p'], nt_ori2new['t'], 'paper_type.dat',
                    'paper_type.txt')

    author_label(args, nt_ori2new['a'], 'author_map_id.dat',
                 'author_label.dat', 'author_label.txt')