Unverified commit 224fe10d, authored by zhang wenhui, committed by GitHub

fix python3 reload bug (#3733)

Parent a46365b6
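Context for the fix: `reload` is a builtin only in Python 2 (Python 3 moved it to `importlib.reload`), and `sys.setdefaultencoding` was removed in Python 3, where the default encoding is already UTF-8. The unguarded `reload(sys)` in these scripts therefore raised `NameError` under Python 3. The commit wraps the legacy calls in a `six.PY2` guard; a minimal standalone sketch of the pattern (not taken verbatim from any one file):

import sys

import six

if six.PY2:
    # Python 2 only: site.py deletes sys.setdefaultencoding at startup,
    # and reloading the sys module restores it so UTF-8 can be forced.
    reload(sys)  # `reload` is a builtin on Python 2
    sys.setdefaultencoding('utf-8')
# Python 3 needs no equivalent: str is already Unicode and the
# default encoding is UTF-8.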
@@ -3,8 +3,10 @@ import six
 import collections
 import os
 import sys
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+

 def word_count(input_file, word_freq=None):
     """
@@ -42,13 +44,16 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
     return word_idx


-def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir):
+def write_paddle(word_idx, train_dir, test_dir, output_train_dir,
+                 output_test_dir):
     files = os.listdir(train_dir)
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
         with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
+            with open(
+                    os.path.join(output_train_dir, fi), "w",
+                    encoding='utf-8') as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -61,7 +66,9 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
         with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
+            with open(
+                    os.path.join(output_test_dir, fi), "w",
+                    encoding='utf-8') as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -69,7 +76,9 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir):
                     wf.write(str(w) + " ")
                 wf.write("\n")

-def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab):
+
+def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
+                output_vocab):
     vocab = build_dict(0, train_dir, test_dir)
     with open(output_vocab, "w", encoding='utf-8') as wf:
         wf.write(str(len(vocab)) + "\n")
@@ -82,4 +91,5 @@ test_dir = sys.argv[2]
 output_train_dir = sys.argv[3]
 output_test_dir = sys.argv[4]
 output_vocab = sys.argv[5]
-text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab)
+text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
+            output_vocab)
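A caveat worth flagging (an observation, not part of this commit): the builtin `open(..., encoding='utf-8')` calls kept as context in the file above are themselves Python-3-only, since Python 2's builtin `open` accepts no `encoding` argument and raises `TypeError`. `io.open`, which another file in this commit already uses, takes `encoding` on both interpreters (on Python 3 it is an alias of the builtin `open`). A short sketch of the portable spelling, with a hypothetical path:

import io

# io.open behaves the same on Python 2 and 3; on Python 3 it *is*
# the builtin open. 'vocab.txt' is a hypothetical example path.
with io.open('vocab.txt', 'w', encoding='utf-8') as wf:
    wf.write(u'word 0\n')  # u-prefix keeps the literal Unicode on Python 2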
@@ -2,11 +2,13 @@ import sys
 import six
 import collections
 import os
 import csv
 import re
 import sys
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+

 def word_count(column_num, input_file, word_freq=None):
     """
@@ -16,10 +18,11 @@ def word_count(column_num, input_file, word_freq=None):
         word_freq = collections.defaultdict(int)
     data_file = csv.reader(input_file)
     for row in data_file:
-        for w in re.split(r'\W+',row[column_num].strip()):
-            word_freq[w]+= 1
+        for w in re.split(r'\W+', row[column_num].strip()):
+            word_freq[w] += 1
     return word_freq

+
 def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     """
     Build a word dictionary from the corpus, Keys of the dictionary are words,
@@ -42,13 +45,16 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     return word_idx


-def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
+def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
+                 output_test_dir):
     files = os.listdir(train_dir)
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
         with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
+            with open(
+                    os.path.join(output_train_dir, fi), "w",
+                    encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -65,7 +71,9 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
         with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
+            with open(
+                    os.path.join(output_test_dir, fi), "w",
+                    encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -77,7 +85,9 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, output_test_dir):
                     wf.write(str(w) + " ")
                 wf.write("\n")

-def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
+
+def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
+                output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
     with open(output_vocab_text, "w", encoding='utf-8') as wf:
@@ -88,7 +98,8 @@ def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
         wf.write(str(len(vocab_tag)) + "\n")
     print("construct word dict done\n")
-    write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir, output_test_dir)
+    write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
+                 output_test_dir)


 train_dir = sys.argv[1]
@@ -97,4 +108,5 @@ output_train_dir = sys.argv[3]
 output_test_dir = sys.argv[4]
 output_vocab_text = sys.argv[5]
 output_vocab_tag = sys.argv[6]
-text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag)
+text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
+            output_vocab_text, output_vocab_tag)
@@ -9,8 +9,9 @@ import paddle.fluid as fluid
 import paddle
 import csv
 import io
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')


 def to_lodtensor(data, place):
...
@@ -10,8 +10,10 @@ import paddle.fluid as fluid
 import paddle
 import net
 import utils
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+

 def parse_args():
     parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example")
@@ -77,15 +79,12 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
         for data in test_reader():
             step_id += 1
             b_size = len([dat[0] for dat in data])
-            wa = np.array(
-                [dat[0] for dat in data]).astype("int64").reshape(
-                    b_size)
-            wb = np.array(
-                [dat[1] for dat in data]).astype("int64").reshape(
-                    b_size)
-            wc = np.array(
-                [dat[2] for dat in data]).astype("int64").reshape(
-                    b_size)
+            wa = np.array([dat[0] for dat in data]).astype(
+                "int64").reshape(b_size)
+            wb = np.array([dat[1] for dat in data]).astype(
+                "int64").reshape(b_size)
+            wc = np.array([dat[2] for dat in data]).astype(
+                "int64").reshape(b_size)

             label = [dat[3] for dat in data]
             input_word = [dat[4] for dat in data]
@@ -94,9 +93,8 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
                     "analogy_a": wa,
                     "analogy_b": wb,
                     "analogy_c": wc,
-                    "all_label":
-                    np.arange(vocab_size).reshape(
-                        vocab_size).astype("int64"),
+                    "all_label": np.arange(vocab_size)
+                    .reshape(vocab_size).astype("int64"),
                 },
                 fetch_list=[pred.name, values],
                 return_numpy=False)
@@ -144,15 +142,12 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
             for data in test_reader():
                 step_id += 1
                 b_size = len([dat[0] for dat in data])
-                wa = np.array(
-                    [dat[0] for dat in data]).astype("int64").reshape(
-                        b_size)
-                wb = np.array(
-                    [dat[1] for dat in data]).astype("int64").reshape(
-                        b_size)
-                wc = np.array(
-                    [dat[2] for dat in data]).astype("int64").reshape(
-                        b_size)
+                wa = np.array([dat[0] for dat in data]).astype(
+                    "int64").reshape(b_size)
+                wb = np.array([dat[1] for dat in data]).astype(
+                    "int64").reshape(b_size)
+                wc = np.array([dat[2] for dat in data]).astype(
+                    "int64").reshape(b_size)

                 label = [dat[3] for dat in data]
                 input_word = [dat[4] for dat in data]
...
@@ -7,8 +7,9 @@ import argparse
 import io
 import math
 import sys
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')

 prog = re.compile("[^a-z ]", flags=0)
@@ -113,9 +114,13 @@ def filter_corpus(args):
     if not os.path.exists(args.output_corpus_dir):
         os.makedirs(args.output_corpus_dir)
     for file in os.listdir(args.input_corpus_dir):
-        with io.open(os.path.join(args.output_corpus_dir, 'convert_' + file), "w", encoding='utf-8') as wf:
+        with io.open(
+                os.path.join(args.output_corpus_dir, 'convert_' + file),
+                "w",
+                encoding='utf-8') as wf:
             with io.open(
-                os.path.join(args.input_corpus_dir, file), encoding='utf-8') as rf:
+                    os.path.join(args.input_corpus_dir, file),
+                    encoding='utf-8') as rf:
                 print(os.path.join(args.input_corpus_dir, file))
                 for line in rf:
                     signal = False
...
@@ -14,8 +14,9 @@ from net import skip_gram_word2vec
 import utils
 import sys
-reload(sys)
-sys.setdefaultencoding('utf-8')
+if six.PY2:
+    reload(sys)
+    sys.setdefaultencoding('utf-8')

 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
...