# -*- coding: utf-8 -*-

import argparse
import re
from collections import Counter


def parse_args():
    """
    Parse the command line arguments of the preprocess script.

    :return: the argparse namespace holding ``data_path`` (str, required),
        ``dict_path`` (str, default './dict') and ``freq`` (int, default 5).
    """
    parser = argparse.ArgumentParser(
        description="Paddle Fluid word2 vector preprocess")
    parser.add_argument(
        '--data_path',
        type=str,
        required=True,
        help="The path of training dataset")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./dict',
        help="The path of generated dict")
    # preprocess() drops every word whose count is <= freq, so the help text
    # has to say "less than or equal", not just "less than".
    parser.add_argument(
        '--freq',
        type=int,
        default=5,
        help="If the word count is less than or equal to freq, "
        "it will be removed from dict")

    return parser.parse_args()


def preprocess(data_path, dict_path, freq):
    """
    Preprocess the data, generate the dictionary and save it into dict_path.

    Every input line is lower-cased, every character other than ``a``-``z``
    and the space is dropped (this includes tabs and the newline, so words
    separated only by such characters run together), and what is left is
    split on whitespace into words.

    :param data_path: the input data path.
    :param dict_path: the generated dict path. Each line of the dict is
        "word count"; an existing file is overwritten.
    :param freq: frequency threshold. A word whose count is less than or
        equal to freq is left out of the dict.
    :return: None
    """
    # Compile the pattern once instead of resolving it again for every line.
    strip_non_letters = re.compile(r"[^a-z ]").sub

    # word to count
    word_count = Counter()
    with open(data_path) as f:
        for line in f:
            word_count.update(strip_non_letters("", line.lower()).split())

    # Filter before opening the output so a failure here cannot leave a
    # truncated dict file behind.
    kept = [(word, count) for word, count in word_count.items()
            if count > freq]

    with open(dict_path, 'w') as f:
        f.writelines("{} {}\n".format(word, count) for word, count in kept)


if __name__ == "__main__":
    # Script entry point: read the command line options, then build the
    # dictionary file from the training data.
    args = parse_args()
    preprocess(
        data_path=args.data_path,
        dict_path=args.dict_path,
        freq=args.freq)