Commit 406f26ae authored by yinhaofeng

support py2

Parent 68cc383f
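Note on the change: the diff below swaps the built-in open() for io.open() and turns every output string into a unicode format string, because on Python 2 the built-in open() has no encoding parameter and writing byte strings to a unicode-mode handle fails. A minimal sketch of the same pattern, assuming an illustrative output path and sample rows (neither is part of this commit):

    # -*- coding: utf-8 -*-
    # Sketch of the py2/py3-compatible I/O pattern this commit adopts.
    # The path and sample rows are illustrative only, not from the commit.
    import io

    rows = [(3, [12, 7, 99]), (1, [5, 5])]

    # io.open() accepts an `encoding` argument on both Python 2 and 3,
    # unlike the Python 2 built-in open().
    with io.open("demo_output.txt", "w", encoding="utf-8") as wf:
        for label, token_ids in rows:
            wf.write(u"{},".format(label))   # u"" keeps the write unicode on py2
            for w in token_ids:
                wf.write(u"{} ".format(w))
            wf.write(u"\n")

    with io.open("demo_output.txt", "r", encoding="utf-8") as f:
        print(f.read())

On Python 3 the same code runs unchanged, since io.open is simply the built-in open there.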
@@ -18,6 +18,7 @@ import collections
 import os
 import csv
 import re
+import io
 import sys
 if six.PY2:
     reload(sys)
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_train_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")
     files = os.listdir(test_dir)
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_test_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")


 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                 output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_text)) + "\n")
+    with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_text))))
     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_tag)) + "\n")
+    with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_tag))))
     print("construct word dict done\n")
     write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
......
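For reference, a hypothetical way to drive the text2paddle entry point shown above; the module name and all paths below are assumptions for illustration, not taken from the commit:

    # Hypothetical driver for the preprocessing entry point in this diff.
    # The module name `text2paddle` and every path are placeholders.
    from text2paddle import text2paddle

    text2paddle(
        train_dir="raw_data/train",
        test_dir="raw_data/test",
        output_train_dir="train_data",
        output_test_dir="test_data",
        output_vocab_text="vocab_text.txt",
        output_vocab_tag="vocab_tag.txt")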