# data_preparation.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import io
import args
import pandas as pd
from sklearn import preprocessing


def _clean_file(source_path, target_path):
    """Copy ``source_path`` to ``target_path`` as comma-separated records.

    Each input line is stripped of surrounding whitespace and every ', '
    is collapsed to ','. Lines that contain no comma afterwards (blank
    lines included) are dropped, and a single trailing '.' is removed
    before the record is written with a '\\n' terminator.
    """
    with io.open(source_path, 'r') as raw, \
            io.open(target_path, 'w') as cleaned:
        for raw_line in raw:
            record = raw_line.strip().replace(', ', ',')
            # An empty record has no comma either, so one check covers both.
            if ',' not in record:
                continue
            if record.endswith('.'):
                record = record[:-1]
            cleaned.write(record + '\n')

def _encode_with_shared_labels(train_df, test_df, columns):
    """Label-encode ``columns`` of both frames in place, one encoder each.

    The encoder is fitted on the union of the train and test values so a
    given category maps to the same integer id in both frames.
    """
    for col in columns:
        encoder = preprocessing.LabelEncoder()
        encoder.fit(
            pd.concat([train_df[col], test_df[col]], ignore_index=True))
        train_df[col] = encoder.transform(train_df[col])
        test_df[col] = encoder.transform(test_df[col])


def _cross_columns(df, columns):
    """Return the string values of ``columns`` joined row-wise with '_'."""
    crossed = df[columns[0]].astype(str)
    for col in columns[1:]:
        crossed = crossed + '_' + df[col].astype(str)
    return crossed


def build_model_columns(train_data_path, test_data_path):
    """Rewrite the cleaned census CSVs as wide/deep feature files, in place.

    Both files are read as header-less CSVs with the 15 census columns.
    The function label-encodes the categorical columns, buckets ``age``,
    builds two crossed features, one-hot encodes the categorical columns
    and derives ``label`` (1 when ``income_bracket`` equals '>50K', else 0).

    Side effects:
        * writes the number of wide columns and the number of deep columns
          (one per line) to ``train_data/columns.txt`` and
          ``test_data/columns.txt``; these directories are not created
          here, so they must already exist;
        * overwrites ``train_data_path`` and ``test_data_path`` with the
          wide columns, the deep columns and ``label`` (NaN written as 0).

    Args:
        train_data_path: path of the cleaned training CSV; overwritten.
        test_data_path: path of the cleaned test CSV; overwritten.
    """
    # The column names are from
    # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'gender',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'income_bracket'
    ]

    # Load the dataset in Pandas
    train_df = pd.read_csv(
        train_data_path,
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names)
    test_df = pd.read_csv(
        test_data_path,
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names)

    categorical_columns = [
        'education', 'marital_status', 'relationship', 'workclass',
        'occupation'
    ]
    # One encoder per column, shared by both frames: fitting a separate
    # encoder on the test set could give the same category a different id.
    _encode_with_shared_labels(train_df, test_df, categorical_columns)

    # NOTE: ages that fall in none of the bins get NaN from pd.cut; they
    # show up as 'nan' in the crossed feature below and are written as 0
    # by the final fillna, i.e. the same value as the first bucket.
    bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
    for df in (train_df, test_df):
        df['age_buckets'] = pd.cut(df['age'].values.tolist(),
                                   bins,
                                   labels=False)

    base_columns = categorical_columns + ['age_buckets']

    # (crossed feature name, source columns joined with '_')
    crossed_specs = [
        ('education_occupation', ['education', 'occupation']),
        ('age_buckets_education_occupation',
         ['age_buckets', 'education', 'occupation']),
    ]
    for crossed_name, source_columns in crossed_specs:
        train_df[crossed_name] = _cross_columns(train_df, source_columns)
        test_df[crossed_name] = _cross_columns(test_df, source_columns)
    crossed_columns = [crossed_name for crossed_name, _ in crossed_specs]
    _encode_with_shared_labels(train_df, test_df, crossed_columns)

    wide_columns = base_columns + crossed_columns

    # dtype is pinned so the CSV holds 0/1 on every pandas version
    # (pandas >= 2.0 would otherwise emit True/False).
    train_dummies = pd.get_dummies(
        train_df[categorical_columns],
        columns=categorical_columns,
        dtype='uint8')
    test_dummies = pd.get_dummies(
        test_df[categorical_columns],
        columns=categorical_columns,
        dtype='uint8')
    # The deep column list is derived from the training frame, so give the
    # test frame exactly those indicator columns: categories absent from
    # the test set become all-zero columns instead of a KeyError, and
    # categories absent from the training set are dropped.
    test_dummies = test_dummies.reindex(
        columns=train_dummies.columns, fill_value=0)
    train_df = train_df.join(train_dummies)
    test_df = test_df.join(test_dummies)

    deep_columns = list(train_dummies.columns) + [
        'age', 'education_num', 'capital_gain', 'capital_loss',
        'hours_per_week'
    ]

    train_df['label'] = (train_df['income_bracket'] == '>50K').astype(int)
    test_df['label'] = (test_df['income_bracket'] == '>50K').astype(int)

    # Both metadata files carry the same two counts.
    column_counts = str(len(wide_columns)) + '\n' + str(len(
        deep_columns)) + '\n'
    for columns_path in ('train_data/columns.txt', 'test_data/columns.txt'):
        with io.open(columns_path, 'w') as f:
            f.write(column_counts)

    output_columns = wide_columns + deep_columns + ['label']
    train_df[output_columns].fillna(0).to_csv(train_data_path, index=False)
    test_df[output_columns].fillna(0).to_csv(test_data_path, index=False)


def clean_file(train_path, test_path, train_data_path, test_data_path):
    """Run ``_clean_file`` on the train pair, then on the test pair.

    ``train_path`` is cleaned into ``train_data_path`` and ``test_path``
    into ``test_data_path``.
    """
    path_pairs = (
        (train_path, train_data_path),
        (test_path, test_data_path),
    )
    for raw_path, cleaned_path in path_pairs:
        _clean_file(raw_path, cleaned_path)


if __name__ == '__main__':
    # Script entry point: take the four paths from args.parse_args(), clean
    # the raw files into the *_data_path files, then turn those files into
    # feature CSVs in place.
    options = args.parse_args()
    clean_file(
        options.train_path,
        options.test_path,
        options.train_data_path,
        options.test_data_path)
    build_model_columns(options.train_data_path, options.test_data_path)