data_utils.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""data processing
"""
import numpy as np
import pandas as pd

from utils.math_utils import z_score


class Dataset(object):
    """Dataset
    """

    def __init__(self, data, stats):
        self.__data = data
        self.mean = stats['mean']
        self.std = stats['std']

    def get_data(self, type):  # type: train, val or test
        return self.__data[type]

    def get_stats(self):
        return {'mean': self.mean, 'std': self.std}

    def get_len(self, type):
        return len(self.__data[type])

    def z_inverse(self, type):
        return self.__data[type] * self.std + self.mean


def seq_gen(len_seq, data_seq, offset, n_frame, n_route, day_slot, C_0=1):
    """Generate data in the form of standard sequence unit."""
    n_slot = day_slot - n_frame + 1

    tmp_seq = np.zeros((len_seq * n_slot, n_frame, n_route, C_0))
    for i in range(len_seq):
        for j in range(n_slot):
            sta = (i + offset) * day_slot + j
            end = sta + n_frame
            tmp_seq[i * n_slot + j, :, :, :] = np.reshape(
                data_seq[sta:end, :], [n_frame, n_route, C_0])
    return tmp_seq


def adj_matrx_gen_custom(input_file, city_file):
    """genenrate Adjacency Matrix from file 
    """
    print("generate adj_matrix data (take long time)...")
    # data
    df = pd.read_csv(
        input_file,
        sep='\t',
        names=['date', '迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数'])
    # 只需要2020年的数据
    df['date'] = pd.to_datetime(df['date'], format="%Y%m%d")
    df = df.set_index('date')
    df = df['2020']
    city_df = pd.read_csv(city_file)
    # 剔除武汉
    city_df = city_df.drop(0)
    num = len(city_df)
    matrix = np.zeros([num, num])
    for i in city_df['city']:
        for j in city_df['city']:
            if (i == j):
                continue
            # 选出从i到j的每日人数
            cut = df[df['迁出城市'].str.contains(i)]
            cut = cut[cut['迁入城市'].str.contains(j)]
            # 求均值作为权重
            average = cut['人数'].mean()
            # 赋值给matrix
            i_index = int(city_df[city_df['city'] == i]['num']) - 1
            j_index = int(city_df[city_df['city'] == j]['num']) - 1
            matrix[i_index, j_index] = average

    np.savetxt("dataset/W_74.csv", matrix, delimiter=",")


def data_gen_custom(input_file, output_file, city_file, n, n_his, n_pred,
                    n_config):
    """data_gen_custom"""
    print("generate training data...")
    # data
    df = pd.read_csv(
        input_file,
        sep='\t',
        names=['date', '迁出省份', '迁出城市', '迁入省份', '迁入城市', '人数'])
    # 只需要2020年的数据
    df['date'] = pd.to_datetime(df['date'], format="%Y%m%d")
    df = df.set_index('date')
    df = df['2020']
    city_df = pd.read_csv(city_file)
    input_df = pd.DataFrame()

    out_df_wuhan = df[df['迁出城市'].str.contains('武汉')]
    for i in city_df['city']:
        # 筛选迁入城市
        in_df_i = out_df_wuhan[out_df_wuhan['迁入城市'].str.contains(i)]
        # 确保按时间升序
        # in_df_i.sort_values("date",inplace=True)
        # 按时间插入
        in_df_i.reset_index(drop=True, inplace=True)
        input_df[i] = in_df_i['人数']

    # 替换Nan值
    input_df = input_df.replace(np.nan, 0)

    x = input_df
    y = pd.read_csv(output_file)
    # 删除第1列
    x.drop(
        x.columns[x.columns.str.contains(
            'unnamed', case=False)],
        axis=1,
        inplace=True)
    y = y.drop(columns=['date'])

    # 剔除迁入武汉的数据
    x = x.drop(columns=['武汉'])
    y = y.drop(columns=['武汉'])

    # param
    n_val, n_test = n_config
    n_train = len(y) - n_val - n_test - 2

    # (?,26,74,1)
    df = pd.DataFrame(columns=x.columns)
    for i in range(len(y) - n_pred + 1):
        df = df.append(x[i:i + n_his])
        df = df.append(y[i:i + n_pred])
    data = df.values.reshape(-1, n_his + n_pred, n,
                             1)  # n == num_nodes == city num

    x_stats = {'mean': np.mean(data), 'std': np.std(data)}

    x_train = data[:n_train]
    x_val = data[n_train:n_train + n_val]
    x_test = data[n_train + n_val:]

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    print("generate successfully!")

    return dataset


def data_gen_mydata(input_file, label_file, n, n_his, n_pred, n_config):
    """data processing
    """
    # data
    x = pd.read_csv(input_file)
    y = pd.read_csv(label_file)
    x = x.drop(columns=['date'])
    y = y.drop(columns=['date'])

    x = x.drop(columns=['武汉'])
    y = y.drop(columns=['武汉'])

    # param
    n_val, n_test = n_config
    n_train = len(y) - n_val - n_test - 2

    # (?,26,74,1)
    df = pd.DataFrame(columns=x.columns)
    for i in range(len(y) - n_pred + 1):
        df = df.append(x[i:i + n_his])
        df = df.append(y[i:i + n_pred])

    data = df.values.reshape(-1, n_his + n_pred, n, 1)

    x_stats = {'mean': np.mean(data), 'std': np.std(data)}

    x_train = data[:n_train]
    x_val = data[n_train:n_train + n_val]
    x_test = data[n_train + n_val:]

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset


def data_gen(file_path, data_config, n_route, n_frame=21, day_slot=288):
    """Source file load and dataset generation."""
    n_train, n_val, n_test = data_config
    # generate training, validation and test data
    try:
        data_seq = pd.read_csv(file_path, header=None).values
    except FileNotFoundError:
        print(f'ERROR: input file was not found in {file_path}.')

    seq_train = seq_gen(n_train, data_seq, 0, n_frame, n_route, day_slot)
    seq_val = seq_gen(n_val, data_seq, n_train, n_frame, n_route, day_slot)
    seq_test = seq_gen(n_test, data_seq, n_train + n_val, n_frame, n_route,
                       day_slot)

    # x_stats: dict, the stats for the train dataset, including the value of mean and standard deviation.
    x_stats = {'mean': np.mean(seq_train), 'std': np.std(seq_train)}

    # x_train, x_val, x_test: np.array, [sample_size, n_frame, n_route, channel_size].
    x_train = z_score(seq_train, x_stats['mean'], x_stats['std'])
    x_val = z_score(seq_val, x_stats['mean'], x_stats['std'])
    x_test = z_score(seq_test, x_stats['mean'], x_stats['std'])

    x_data = {'train': x_train, 'val': x_val, 'test': x_test}
    dataset = Dataset(x_data, x_stats)
    return dataset


def gen_batch(inputs, batch_size, dynamic_batch=False, shuffle=False):
    """Data iterator in batch.

    Args:
        inputs: np.ndarray, [len_seq, n_frame, n_route, C_0], standard sequence units.
        batch_size: int, size of batch.
        dynamic_batch: bool, whether changes the batch size in the last batch 
            if its length is less than the default.
        shuffle: bool, whether shuffle the batches.
    """
    len_inputs = len(inputs)

    if shuffle:
        idx = np.arange(len_inputs)
        np.random.shuffle(idx)

    for start_idx in range(0, len_inputs, batch_size):
        end_idx = start_idx + batch_size
        if end_idx > len_inputs:
            if dynamic_batch:
                end_idx = len_inputs
            else:
                break
        if shuffle:
            slide = idx[start_idx:end_idx]
        else:
            slide = slice(start_idx, end_idx)

        yield inputs[slide]