"""Flatten a directory of labelled text files into one tab-separated file.

Expected layout: ``<dir>/neg/*`` and ``<dir>/pos/*``, where every file is
named ``<id>_<score>.<ext>``.  Files under ``neg`` get label 0, files under
``pos`` get label 1.
"""
import logging
import os
from collections import namedtuple

import numpy as np  # not used below; kept because the original file imports it

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so degrade to plain iteration
    # instead of failing when tqdm is not installed.
    def tqdm(iterable, **_kwargs):
        """Fallback used when tqdm is unavailable: return *iterable* as is."""
        return iterable


# One record per input file.  ``qid`` is a 1-based running index,
# ``text_a`` the flattened document, ``label`` 0 (neg) or 1 (pos) and
# ``score`` the string found between '_' and the first '.' of the file name.
Example = namedtuple('Example', ['qid', 'text_a', 'label', 'score'])


def _read_document(file_path):
    """Return the content of *file_path* as a single line of text.

    Every line is stripped of surrounding whitespace and the lines are then
    joined with a single space.
    """
    # Explicit encoding: the platform default differs between systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        # NOTE(review): in the file as received, this literal was two raw
        # line breaks inside the quotes (a syntax error).  It is reproduced
        # here as '\n\n', which can never match after strip() on a single
        # line -- the original token was probably something else (e.g. an
        # HTML break marker).  Confirm against the upstream source.
        return ' '.join(line.strip().replace('\n\n', ' ') for line in f)


def read_files(dir_path):
    """Load every example found under ``dir_path/neg`` and ``dir_path/pos``.

    :param dir_path: directory that contains the ``neg`` and ``pos``
        sub-directories.
    :return: list of ``Example`` tuples, all ``neg`` files (label 0) first,
        then all ``pos`` files (label 1).  ``qid`` numbers the examples from
        1 in that order.
    :raises ValueError: if a file name does not contain exactly one ``_``.
    :raises OSError: if a sub-directory or file cannot be read.
    """
    examples = []

    def _read_files(dir_p, label):
        logging.info('loading data from %s', dir_p)
        # os.listdir returns entries in arbitrary order; sort them so the
        # position-based qid is the same on every run and machine.
        data_files = sorted(os.listdir(dir_p))
        desc = "loading " + dir_p
        for data_file in tqdm(data_files, desc=desc):
            # The id part of the file name is not used; qid is positional.
            _, score = data_file.split('_')
            score = score.split('.')[0]
            doc_text = _read_document(os.path.join(dir_p, data_file))
            examples.append(Example(
                qid=len(examples) + 1,
                text_a=doc_text,
                label=label,
                score=score,
            ))

    _read_files(os.path.join(dir_path, 'neg'), label=0)
    _read_files(os.path.join(dir_path, 'pos'), label=1)

    logging.info('loading data finished')
    return examples


def write_to_one(dir, o_file_name):
    """Write all examples found under *dir* to one tab-separated file.

    The output starts with the header ``qid, label, score, text_a`` followed
    by one row per example.  Tab characters inside the text are removed so
    they cannot be mistaken for column separators.

    :param dir: directory passed on to ``read_files`` (the name shadows the
        builtin but is kept so keyword callers keep working).
    :param o_file_name: path of the file to create or overwrite.
    """
    examples = read_files(dir)
    logging.info('ex nums:%d', len(examples))
    with open(o_file_name, 'w', encoding='utf-8') as fout:
        fout.write("qid\tlabel\tscore\ttext_a\n")
        for ex in examples:
            try:
                fout.write("{}\t{}\t{}\t{}\n".format(
                    ex.qid, ex.label, ex.score, ex.text_a.replace('\t', '')))
            except Exception:
                # Record which example broke the export, then re-raise with
                # the original traceback intact.
                logging.exception(
                    'failed to write example qid=%s label=%s score=%s text_a=%r',
                    ex.qid, ex.label, ex.score, ex.text_a)
                raise


if __name__ == "__main__":
    write_to_one("train", 'train.txt')
    write_to_one("test", "test.txt")