diff --git a/models/contentunderstanding/tagspace/data/text2paddle.py b/models/contentunderstanding/tagspace/data/text2paddle.py
index 9b4a4fafb2945bfb30869dafb86e066096a8bea0..54d7848d7d84649ff8970be0ce63b1d2fe429a28 100644
--- a/models/contentunderstanding/tagspace/data/text2paddle.py
+++ b/models/contentunderstanding/tagspace/data/text2paddle.py
@@ -18,6 +18,7 @@ import collections
 import os
 import csv
 import re
+import io
 import sys
 if six.PY2:
     reload(sys)
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
 
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_train_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")
 
     files = os.listdir(test_dir)
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_test_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")
 
 
 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                 output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_text)) + "\n")
+    with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_text))))
     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_tag)) + "\n")
+    with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_tag))))
     print("construct word dict done\n")
     write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
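
Every hunk in this patch applies the same Python 2/3 portability pattern: Python 2's built-in `open()` has no `encoding` keyword, while `io.open()` does and behaves like Python 3's built-in `open` on both interpreters; in text mode, Python 2's `io.open()` accepts only `unicode`, which is why each `wf.write(...)` switches to `u"..."` literals. A minimal standalone sketch of the pattern follows; the filename and value written are hypothetical stand-ins, not part of the patch.

```python
# -*- coding: utf-8 -*-
import io

# Py2's builtin open() rejects the `encoding` kwarg; io.open() accepts it,
# and on Py3 io.open is simply an alias for the builtin open, so this line
# runs unchanged on both interpreters.
with io.open("vocab.txt", "w", encoding="utf-8") as wf:
    # In text mode, Py2's io.open() requires unicode rather than str,
    # hence the u"..." literal (str + "..." concatenation would raise
    # TypeError on Py2 for the same reason).
    wf.write(u"{}\n".format(42))
```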