Unverified commit 2785c5d3, authored by zhang wenhui, committed by GitHub

fix os.path & encoding (#3633)

* update api in PaddleRec, test=release/1.6

* fix encoding, os.path.join, test=release/1.6

* fix encoding, os.path.join, test=release/1.6
Parent 365971e5
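The change is mechanical across the PaddleRec models: build paths with `os.path.join` instead of `'/'` concatenation, and pass an explicit `encoding` to `open`. A minimal before/after sketch of the pattern (directory and file names here are hypothetical, for illustration only):

```python
import os

# Hypothetical names, not taken from the repo.
data_dir, fname = "train_data", "part-0"

# Before: manual '/' concatenation and the platform's default encoding.
#   f = open(data_dir + '/' + fname, "r")

# After: os.path.join picks the right separator on every OS, and an
# explicit encoding makes reads independent of the system locale.
with open(os.path.join(data_dir, fname), "r", encoding='utf-8') as f:
    for line in f:
        print(line.strip())
```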
@@ -281,7 +281,7 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2
 Refer to cluster_train.py to configure other multi-machine environments.
-Run the following command to simulate the multi-machine scenario locally:
+Run the following command to simulate the multi-machine scenario locally (Windows is not supported yet):
 ```
 sh cluster_train.sh
 ```
@@ -2,8 +2,8 @@ import sys
 def convert_format(input, output):
-    with open(input) as rf:
-        with open(output, "w") as wf:
+    with open(input, "r", encoding='utf-8') as rf:
+        with open(output, "w", encoding='utf-8') as wf:
             last_sess = -1
             sign = 1
             i = 0
@@ -2,6 +2,9 @@ import sys
 import six
 import collections
 import os
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
 def word_count(input_file, word_freq=None):
     """
@@ -25,11 +28,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(train_dir + '/' + fi, "r") as f:
+        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(test_dir + '/' + fi, "r") as f:
+        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(f, word_freq)
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -44,8 +47,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(train_dir + '/' + fi, "r") as f:
-            with open(output_train_dir + '/' + fi, "w") as wf:
+        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -57,8 +60,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(test_dir + '/' + fi, "r") as f:
-            with open(output_test_dir + '/' + fi, "w") as wf:
+        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
                 for l in f:
                     l = l.strip().split()
                     l = [word_idx.get(w) for w in l]
@@ -68,7 +71,7 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di
 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab):
     vocab = build_dict(0, train_dir, test_dir)
-    with open(output_vocab, "w") as wf:
+    with open(output_vocab, "w", encoding='utf-8') as wf:
         wf.write(str(len(vocab)) + "\n")
         #wf.write(str(vocab))
     write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir)
@@ -86,7 +86,7 @@ def to_lodtensor_bpr_test(raw_data, vocab_size, place):
 def get_vocab_size(vocab_path):
-    with open(vocab_path, "r") as rf:
+    with open(vocab_path, "r", encoding='utf-8') as rf:
         line = rf.readline()
         return int(line.strip())
@@ -184,7 +184,7 @@ def reader_creator(file_dir, n, data_type):
     def reader():
         files = os.listdir(file_dir)
         for fi in files:
-            with open(file_dir + '/' + fi, "r") as f:
+            with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
                 for l in f:
                     if DataType.SEQ == data_type:
                         l = l.strip().split()
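The `reader_creator`/`reader` nesting touched here is the Paddle-1.x reader pattern: the outer function captures configuration and returns a zero-argument generator that the training loop can re-invoke once per pass over the data. A simplified sketch of how such a reader is built and consumed (the directory name and tokenization are assumptions, not the repo's exact code):

```python
import os

def reader_creator(file_dir):
    def reader():
        # Re-invoking reader() restarts the pass from the first file.
        for fi in os.listdir(file_dir):
            with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
                for l in f:
                    yield l.strip().split()
    return reader

train_reader = reader_creator("train_data")  # assumed directory name
for sample in train_reader():                # one full pass over the files
    pass
```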
@@ -43,7 +43,7 @@ CPU single-machine multi-GPU training
 CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
 ```
-Simulate multi-machine training locally:
+Simulate multi-machine training locally (not supported on Windows):
 ``` bash
 sh cluster_train.sh
 ```
@@ -33,7 +33,7 @@ class YoochooseVocab(Vocab):
     def load(self, filelist):
         idx = 0
         for f in filelist:
-            with open(f, "r") as fin:
+            with open(f, "r", encoding='utf-8') as fin:
                 for line in fin:
                     group = line.strip().split()
                     for item in group:
@@ -64,7 +64,7 @@ class YoochooseDataset(Dataset):
     def _reader_creator(self, filelist, is_train):
         def reader():
             for f in filelist:
-                with open(f, 'r') as fin:
+                with open(f, 'r', encoding='utf-8') as fin:
                     line_idx = 0
                     for line in fin:
                         ids = line.strip().split()
@@ -7,7 +7,7 @@ import paddle
 def get_vocab_size(vocab_path):
-    with open(vocab_path, "r") as rf:
+    with open(vocab_path, "r", encoding='utf-8') as rf:
         line = rf.readline()
         return int(line.strip())
@@ -4,6 +4,9 @@ import collections
 import os
 import csv
 import re
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
 def word_count(column_num, input_file, word_freq=None):
     """
@@ -25,11 +28,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(train_dir + '/' + fi, "r") as f:
+        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(test_dir + '/' + fi, "r") as f:
+        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -44,8 +47,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(train_dir + '/' + fi, "r") as f:
-            with open(output_train_dir + '/' + fi, "w") as wf:
+        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -61,8 +64,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(test_dir + '/' + fi, "r") as f:
-            with open(output_test_dir + '/' + fi, "w") as wf:
+        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
@@ -77,11 +80,11 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu
 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w") as wf:
+    with open(output_vocab_text, "w", encoding='utf-8') as wf:
         wf.write(str(len(vocab_text)) + "\n")
     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w") as wf:
+    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
         wf.write(str(len(vocab_tag)) + "\n")
     print("construct word dict done\n")
@@ -8,6 +8,8 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle
 import csv
+reload(sys)
+sys.setdefaultencoding('utf-8')
 def to_lodtensor(data, place):
     """ convert to LODtensor """
@@ -126,7 +128,7 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type):
     def reader():
         files = os.listdir(file_dir)
         for fi in files:
-            with open(file_dir + '/' + fi, "r") as f:
+            with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
                 for l in f:
                     l = l.strip().split(",")
                     pos_index = int(l[0])
@@ -156,7 +158,7 @@ def test_reader_creator(file_dir, tag_size, n, data_type):
     def reader():
         files = os.listdir(file_dir)
         for fi in files:
-            with open(file_dir + '/' + fi, "r") as f:
+            with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
                 for l in f:
                     l = l.strip().split(",")
                     pos_index = int(l[0])
@@ -97,7 +97,7 @@ python train.py -h
 OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
 ```
-Simulate multi-machine training locally on a single machine:
+Simulate multi-machine training locally on a single machine (Windows is not supported yet):
 ```bash
 sh cluster_train.sh
@@ -10,7 +10,8 @@ import paddle.fluid as fluid
 import paddle
 import net
 import utils
+reload(sys)
+sys.setdefaultencoding('utf-8')
 def parse_args():
     parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example")
@@ -6,6 +6,9 @@ import six
 import argparse
 import io
 import math
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
 prog = re.compile("[^a-z ]", flags=0)
@@ -110,10 +113,10 @@ def filter_corpus(args):
     if not os.path.exists(args.output_corpus_dir):
         os.makedirs(args.output_corpus_dir)
     for file in os.listdir(args.input_corpus_dir):
-        with io.open(args.output_corpus_dir + '/convert_' + file, "w") as wf:
+        with io.open(os.path.join(args.output_corpus_dir, 'convert_' + file), "w", encoding='utf-8') as wf:
             with io.open(
-                    args.input_corpus_dir + '/' + file, encoding='utf-8') as rf:
-                print(args.input_corpus_dir + '/' + file)
+                    os.path.join(args.input_corpus_dir, file), encoding='utf-8') as rf:
+                print(os.path.join(args.input_corpus_dir, file))
                 for line in rf:
                     signal = False
                     line = text_strip(line)
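`io.open` in this file is the Python-3-style `open` that also exists on Python 2, so it accepts `encoding=` under both interpreters; the fix above gives the write side the same explicit encoding the read side already had. A small round-trip sketch under that assumption (paths are hypothetical):

```python
import io
import os

src = os.path.join("corpus", "part-0")          # hypothetical input file
dst = os.path.join("corpus", "convert_part-0")  # hypothetical output file

with io.open(src, "r", encoding='utf-8') as rf, \
        io.open(dst, "w", encoding='utf-8') as wf:
    for line in rf:
        # io.open yields text (unicode on Python 2, str on Python 3),
        # so the same code runs unchanged on both.
        wf.write(line.lower())
```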
@@ -12,6 +12,11 @@ import six
 import reader
 from net import skip_gram_word2vec
 import utils
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
@@ -12,7 +12,7 @@ import preprocess
 def BuildWord_IdMap(dict_path):
     word_to_id = dict()
     id_to_word = dict()
-    with open(dict_path, 'r') as f:
+    with open(dict_path, 'r', encoding='utf-8') as f:
         for line in f:
             word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
             id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
@@ -89,7 +89,7 @@ def reader_creator(file_dir, word_to_id):
     def reader():
         files = os.listdir(file_dir)
         for fi in files:
-            with open(file_dir + '/' + fi, "r") as f:
+            with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f:
                 for line in f:
                     if ':' in line:
                         pass