未验证 提交 224fe10d 编写于 作者: Z zhang wenhui 提交者: GitHub

fix python3 reload bug (#3733)

上级 a46365b6
......@@ -3,8 +3,10 @@ import six
import collections
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def word_count(input_file, word_freq=None):
"""
......@@ -42,13 +44,16 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
return word_idx
def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir):
def write_paddle(word_idx, train_dir, test_dir, output_train_dir,
output_test_dir):
files = os.listdir(train_dir)
if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir)
for fi in files:
with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
with open(
os.path.join(output_train_dir, fi), "w",
encoding='utf-8') as wf:
for l in f:
l = l.strip().split()
l = [word_idx.get(w) for w in l]
......@@ -61,7 +66,9 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
os.mkdir(output_test_dir)
for fi in files:
with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
with open(
os.path.join(output_test_dir, fi), "w",
encoding='utf-8') as wf:
for l in f:
l = l.strip().split()
l = [word_idx.get(w) for w in l]
......@@ -69,7 +76,9 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
wf.write(str(w) + " ")
wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab):
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab):
vocab = build_dict(0, train_dir, test_dir)
with open(output_vocab, "w", encoding='utf-8') as wf:
wf.write(str(len(vocab)) + "\n")
......@@ -82,4 +91,5 @@ test_dir = sys.argv[2]
output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4]
output_vocab = sys.argv[5]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab)
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab)
......@@ -2,11 +2,13 @@ import sys
import six
import collections
import os
import csv
import csv
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def word_count(column_num, input_file, word_freq=None):
"""
......@@ -16,10 +18,11 @@ def word_count(column_num, input_file, word_freq=None):
word_freq = collections.defaultdict(int)
data_file = csv.reader(input_file)
for row in data_file:
for w in re.split(r'\W+',row[column_num].strip()):
word_freq[w]+= 1
for w in re.split(r'\W+', row[column_num].strip()):
word_freq[w] += 1
return word_freq
def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
"""
Build a word dictionary from the corpus, Keys of the dictionary are words,
......@@ -42,13 +45,16 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
return word_idx
def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
output_test_dir):
files = os.listdir(train_dir)
if not os.path.exists(output_train_dir):
os.mkdir(output_train_dir)
for fi in files:
with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
with open(
os.path.join(output_train_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
......@@ -65,7 +71,9 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
os.mkdir(output_test_dir)
for fi in files:
with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
with open(
os.path.join(output_test_dir, fi), "w",
encoding='utf-8') as wf:
data_file = csv.reader(f)
for row in data_file:
tag_raw = re.split(r'\W+', row[0].strip())
......@@ -77,7 +85,9 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
wf.write(str(w) + " ")
wf.write("\n")
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab_text, output_vocab_tag):
print("start constuct word dict")
vocab_text = build_dict(2, 0, train_dir, test_dir)
with open(output_vocab_text, "w", encoding='utf-8') as wf:
......@@ -88,7 +98,8 @@ def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_v
wf.write(str(len(vocab_tag)) + "\n")
print("construct word dict done\n")
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir)
write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
output_test_dir)
train_dir = sys.argv[1]
......@@ -97,4 +108,5 @@ output_train_dir = sys.argv[3]
output_test_dir = sys.argv[4]
output_vocab_text = sys.argv[5]
output_vocab_tag = sys.argv[6]
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag)
text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
output_vocab_text, output_vocab_tag)
......@@ -9,8 +9,9 @@ import paddle.fluid as fluid
import paddle
import csv
import io
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def to_lodtensor(data, place):
......
......@@ -10,8 +10,10 @@ import paddle.fluid as fluid
import paddle
import net
import utils
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
def parse_args():
parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example")
......@@ -77,15 +79,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
for data in test_reader():
step_id += 1
b_size = len([dat[0] for dat in data])
wa = np.array(
[dat[0] for dat in data]).astype("int64").reshape(
b_size)
wb = np.array(
[dat[1] for dat in data]).astype("int64").reshape(
b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size)
wa = np.array([dat[0] for dat in data]).astype(
"int64").reshape(b_size)
wb = np.array([dat[1] for dat in data]).astype(
"int64").reshape(b_size)
wc = np.array([dat[2] for dat in data]).astype(
"int64").reshape(b_size)
label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data]
......@@ -94,9 +93,8 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
"analogy_a": wa,
"analogy_b": wb,
"analogy_c": wc,
"all_label":
np.arange(vocab_size).reshape(
vocab_size).astype("int64"),
"all_label": np.arange(vocab_size)
.reshape(vocab_size).astype("int64"),
},
fetch_list=[pred.name, values],
return_numpy=False)
......@@ -144,15 +142,12 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
for data in test_reader():
step_id += 1
b_size = len([dat[0] for dat in data])
wa = np.array(
[dat[0] for dat in data]).astype("int64").reshape(
b_size)
wb = np.array(
[dat[1] for dat in data]).astype("int64").reshape(
b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size)
wa = np.array([dat[0] for dat in data]).astype(
"int64").reshape(b_size)
wb = np.array([dat[1] for dat in data]).astype(
"int64").reshape(b_size)
wc = np.array([dat[2] for dat in data]).astype(
"int64").reshape(b_size)
label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data]
......
......@@ -7,8 +7,9 @@ import argparse
import io
import math
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
prog = re.compile("[^a-z ]", flags=0)
......@@ -113,9 +114,13 @@ def filter_corpus(args):
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(os.path.join(args.output_corpus_dir, 'convert_' + file), "w", encoding='utf-8') as wf:
with io.open(
os.path.join(args.output_corpus_dir, 'convert_' + file),
"w",
encoding='utf-8') as wf:
with io.open(
os.path.join(args.input_corpus_dir, file), encoding='utf-8') as rf:
os.path.join(args.input_corpus_dir, file),
encoding='utf-8') as rf:
print(os.path.join(args.input_corpus_dir, file))
for line in rf:
signal = False
......
......@@ -14,8 +14,9 @@ from net import skip_gram_word2vec
import utils
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf-8')
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册