# preprocess.py
# encoding=utf-8
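"""Tokenize TSV text-classification data into fixed-length id sequences.

Reads a vocabulary from word_dict.txt, then converts the train/dev/test
TSV files (one "text<TAB>label" pair per line, text pre-split on spaces)
into files of padded token-id sequences.
"""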


def build_word_dict():
    """Load the vocabulary and map each word to a 1-based integer id.

    Id 0 is reserved for out-of-vocabulary words and padding.
    """
    word_file = "word_dict.txt"
    with open(word_file, "r", encoding="utf-8") as f:
        words = [line.strip() for line in f]
    # Ids start at 1 so that 0 stays free for unknown words and padding.
    return dict(zip(words, range(1, len(words) + 1)))
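
# A minimal sketch of the assumed word_dict.txt layout, one token per line:
#     the
#     of
#     and
# which build_word_dict() would map to {"the": 1, "of": 2, "and": 3};
# any word missing from the file is later mapped to id 0.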


def build_token_data(word_dict, txt_file, token_file):
    """Convert "text<TAB>label" lines into fixed-length token-id lines.

    Each output line holds max_text_size token ids, the true sequence
    length, and the label, all space-separated.
    """
    max_text_size = 100

    with open(txt_file, "r", encoding="utf-8") as f, \
            open(token_file, "w", encoding="utf-8") as fout:
        i = 0
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                continue  # skip malformed lines missing a label
            words = fields[0].split(" ")
            label = fields[1]
            # Map each word to its id; out-of-vocabulary words get id 0.
            tokens = [str(word_dict.get(word, 0)) for word in words]

            seg_len = len(tokens)
            if seg_len < 5:
                continue  # drop very short sequences
            if seg_len >= max_text_size:
                # Truncate long sequences to max_text_size ids.
                tokens = tokens[:max_text_size]
                seg_len = max_text_size
            else:
                # Pad short sequences with 0 up to max_text_size ids.
                tokens += ["0"] * (max_text_size - seg_len)

            fout.write(" ".join(tokens) + " " + str(seg_len) + " " + label + "\n")
            i += 1
            if i % 100 == 0:
                print(str(i) + " lines OK")
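
# Illustrative output line (hypothetical ids): a 7-word input with label "1"
# becomes 100 space-separated ids (7 real ids plus 93 zeros of padding),
# followed by the true length and the label:
#     23 5 0 17 9 4 2 0 0 ... 0 7 1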


if __name__ == "__main__":
    word_dict = build_word_dict()

    # Tokenize each split: <name>.tsv -> <name>.txt.
    for name in ("test", "dev", "train"):
        build_token_data(word_dict, name + ".tsv", name + ".txt")