From 2785c5d3a764f086fd7fcfb08228b27b3ba097f7 Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Thu, 17 Oct 2019 12:56:19 +0800 Subject: [PATCH] fix os.path & encoding (#3633) * update api in PaddleRec, test=release/1.6 * fix encoding, os.path.join, test=release/1.6 * fix encoding, os.path.join, test=release/1.6 --- PaddleRec/gru4rec/README.md | 2 +- PaddleRec/gru4rec/convert_format.py | 4 ++-- PaddleRec/gru4rec/text2paddle.py | 17 ++++++++++------- PaddleRec/gru4rec/utils.py | 4 ++-- PaddleRec/ssr/README.md | 2 +- PaddleRec/ssr/reader.py | 4 ++-- PaddleRec/ssr/utils.py | 2 +- PaddleRec/tagspace/text2paddle.py | 19 +++++++++++-------- PaddleRec/tagspace/utils.py | 6 ++++-- PaddleRec/word2vec/README.md | 2 +- PaddleRec/word2vec/infer.py | 3 ++- PaddleRec/word2vec/preprocess.py | 9 ++++++--- PaddleRec/word2vec/train.py | 5 +++++ PaddleRec/word2vec/utils.py | 4 ++-- 14 files changed, 50 insertions(+), 33 deletions(-) diff --git a/PaddleRec/gru4rec/README.md b/PaddleRec/gru4rec/README.md index 581fda3e..15a9b106 100644 --- a/PaddleRec/gru4rec/README.md +++ b/PaddleRec/gru4rec/README.md @@ -281,7 +281,7 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2 可参考cluster_train.py 配置其他多机环境 -运行命令本地模拟多机场景 +运行命令本地模拟多机场景, 暂不支持windows ``` sh cluster_train.sh ``` diff --git a/PaddleRec/gru4rec/convert_format.py b/PaddleRec/gru4rec/convert_format.py index b5db511e..7bca1d52 100644 --- a/PaddleRec/gru4rec/convert_format.py +++ b/PaddleRec/gru4rec/convert_format.py @@ -2,8 +2,8 @@ import sys def convert_format(input, output): - with open(input) as rf: - with open(output, "w") as wf: + with open(input, "r", encoding='utf-8') as rf: + with open(output, "w", encoding='utf-8') as wf: last_sess = -1 sign = 1 i = 0 diff --git a/PaddleRec/gru4rec/text2paddle.py b/PaddleRec/gru4rec/text2paddle.py index 563a8cad..9b7b98e6 100644 --- a/PaddleRec/gru4rec/text2paddle.py +++ b/PaddleRec/gru4rec/text2paddle.py @@ -2,6 +2,9 @@ import sys import six import collections import os +import sys +reload(sys) +sys.setdefaultencoding('utf-8') def word_count(input_file, word_freq=None): """ @@ -25,11 +28,11 @@ def build_dict(min_word_freq=0, train_dir="", test_dir=""): word_freq = collections.defaultdict(int) files = os.listdir(train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(f, word_freq) files = os.listdir(test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(f, word_freq) word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] @@ -44,8 +47,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di if not os.path.exists(output_train_dir): os.mkdir(output_train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: - with open(output_train_dir + '/' + fi, "w") as wf: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: + with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf: for l in f: l = l.strip().split() l = [word_idx.get(w) for w in l] @@ -57,8 +60,8 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di if not os.path.exists(output_test_dir): os.mkdir(output_test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: - with open(output_test_dir + '/' + fi, "w") as wf: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: + with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf: for l in f: l = l.strip().split() l = [word_idx.get(w) for w in l] @@ -68,7 +71,7 @@ def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_di def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab): vocab = build_dict(0, train_dir, test_dir) - with open(output_vocab, "w") as wf: + with open(output_vocab, "w", encoding='utf-8') as wf: wf.write(str(len(vocab)) + "\n") #wf.write(str(vocab)) write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir) diff --git a/PaddleRec/gru4rec/utils.py b/PaddleRec/gru4rec/utils.py index ffd05fc5..1653483a 100644 --- a/PaddleRec/gru4rec/utils.py +++ b/PaddleRec/gru4rec/utils.py @@ -86,7 +86,7 @@ def to_lodtensor_bpr_test(raw_data, vocab_size, place): def get_vocab_size(vocab_path): - with open(vocab_path, "r") as rf: + with open(vocab_path, "r", encoding='utf-8') as rf: line = rf.readline() return int(line.strip()) @@ -184,7 +184,7 @@ def reader_creator(file_dir, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: if DataType.SEQ == data_type: l = l.strip().split() diff --git a/PaddleRec/ssr/README.md b/PaddleRec/ssr/README.md index 6ded50b9..6abc5240 100644 --- a/PaddleRec/ssr/README.md +++ b/PaddleRec/ssr/README.md @@ -43,7 +43,7 @@ cpu 单机多卡训练 CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10 ``` -本地模拟多机训练 +本地模拟多机训练, 不支持windows. ``` bash sh cluster_train.sh ``` diff --git a/PaddleRec/ssr/reader.py b/PaddleRec/ssr/reader.py index 15989fd8..7752ca3f 100644 --- a/PaddleRec/ssr/reader.py +++ b/PaddleRec/ssr/reader.py @@ -33,7 +33,7 @@ class YoochooseVocab(Vocab): def load(self, filelist): idx = 0 for f in filelist: - with open(f, "r") as fin: + with open(f, "r", encoding='utf-8') as fin: for line in fin: group = line.strip().split() for item in group: @@ -64,7 +64,7 @@ class YoochooseDataset(Dataset): def _reader_creator(self, filelist, is_train): def reader(): for f in filelist: - with open(f, 'r') as fin: + with open(f, 'r', encoding='utf-8') as fin: line_idx = 0 for line in fin: ids = line.strip().split() diff --git a/PaddleRec/ssr/utils.py b/PaddleRec/ssr/utils.py index 353cf336..27cb75ea 100644 --- a/PaddleRec/ssr/utils.py +++ b/PaddleRec/ssr/utils.py @@ -7,7 +7,7 @@ import paddle def get_vocab_size(vocab_path): - with open(vocab_path, "r") as rf: + with open(vocab_path, "r", encoding='utf-8') as rf: line = rf.readline() return int(line.strip()) diff --git a/PaddleRec/tagspace/text2paddle.py b/PaddleRec/tagspace/text2paddle.py index 6aa040c0..0727ba98 100644 --- a/PaddleRec/tagspace/text2paddle.py +++ b/PaddleRec/tagspace/text2paddle.py @@ -4,6 +4,9 @@ import collections import os import csv import re +import sys +reload(sys) +sys.setdefaultencoding('utf-8') def word_count(column_num, input_file, word_freq=None): """ @@ -25,11 +28,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""): word_freq = collections.defaultdict(int) files = os.listdir(train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(column_num, f, word_freq) files = os.listdir(test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: word_freq = word_count(column_num, f, word_freq) word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] @@ -44,8 +47,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu if not os.path.exists(output_train_dir): os.mkdir(output_train_dir) for fi in files: - with open(train_dir + '/' + fi, "r") as f: - with open(output_train_dir + '/' + fi, "w") as wf: + with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f: + with open(os.path.join(output_train_dir, fi), "w", encoding='utf-8') as wf: data_file = csv.reader(f) for row in data_file: tag_raw = re.split(r'\W+', row[0].strip()) @@ -61,8 +64,8 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu if not os.path.exists(output_test_dir): os.mkdir(output_test_dir) for fi in files: - with open(test_dir + '/' + fi, "r") as f: - with open(output_test_dir + '/' + fi, "w") as wf: + with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f: + with open(os.path.join(output_test_dir, fi), "w", encoding='utf-8') as wf: data_file = csv.reader(f) for row in data_file: tag_raw = re.split(r'\W+', row[0].strip()) @@ -77,11 +80,11 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir, outpu def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab_text, output_vocab_tag): print("start constuct word dict") vocab_text = build_dict(2, 0, train_dir, test_dir) - with open(output_vocab_text, "w") as wf: + with open(output_vocab_text, "w", encoding='utf-8') as wf: wf.write(str(len(vocab_text)) + "\n") vocab_tag = build_dict(0, 0, train_dir, test_dir) - with open(output_vocab_tag, "w") as wf: + with open(output_vocab_tag, "w", encoding='utf-8') as wf: wf.write(str(len(vocab_tag)) + "\n") print("construct word dict done\n") diff --git a/PaddleRec/tagspace/utils.py b/PaddleRec/tagspace/utils.py index 80b77c8e..14a0b9f1 100644 --- a/PaddleRec/tagspace/utils.py +++ b/PaddleRec/tagspace/utils.py @@ -8,6 +8,8 @@ import numpy as np import paddle.fluid as fluid import paddle import csv +reload(sys) +sys.setdefaultencoding('utf-8') def to_lodtensor(data, place): """ convert to LODtensor """ @@ -126,7 +128,7 @@ def train_reader_creator(file_dir, tag_size, neg_size, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: l = l.strip().split(",") pos_index = int(l[0]) @@ -156,7 +158,7 @@ def test_reader_creator(file_dir, tag_size, n, data_type): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for l in f: l = l.strip().split(",") pos_index = int(l[0]) diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md index 912aaa6a..581c81ac 100644 --- a/PaddleRec/word2vec/README.md +++ b/PaddleRec/word2vec/README.md @@ -97,7 +97,7 @@ python train.py -h OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse ``` -本地单机模拟多机训练 +本地单机模拟多机训练, 目前暂不支持windows。 ```bash sh cluster_train.sh diff --git a/PaddleRec/word2vec/infer.py b/PaddleRec/word2vec/infer.py index 36357dd6..7eb5dd5d 100644 --- a/PaddleRec/word2vec/infer.py +++ b/PaddleRec/word2vec/infer.py @@ -10,7 +10,8 @@ import paddle.fluid as fluid import paddle import net import utils - +reload(sys) +sys.setdefaultencoding('utf-8') def parse_args(): parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example") diff --git a/PaddleRec/word2vec/preprocess.py b/PaddleRec/word2vec/preprocess.py index 1d5ad03c..3af99305 100644 --- a/PaddleRec/word2vec/preprocess.py +++ b/PaddleRec/word2vec/preprocess.py @@ -6,6 +6,9 @@ import six import argparse import io import math +import sys +reload(sys) +sys.setdefaultencoding('utf-8') prog = re.compile("[^a-z ]", flags=0) @@ -110,10 +113,10 @@ def filter_corpus(args): if not os.path.exists(args.output_corpus_dir): os.makedirs(args.output_corpus_dir) for file in os.listdir(args.input_corpus_dir): - with io.open(args.output_corpus_dir + '/convert_' + file, "w") as wf: + with io.open(os.path.join(args.output_corpus_dir, 'convert_' + file), "w", encoding='utf-8') as wf: with io.open( - args.input_corpus_dir + '/' + file, encoding='utf-8') as rf: - print(args.input_corpus_dir + '/' + file) + os.path.join(args.input_corpus_dir, file), encoding='utf-8') as rf: + print(os.path.join(args.input_corpus_dir, file)) for line in rf: signal = False line = text_strip(line) diff --git a/PaddleRec/word2vec/train.py b/PaddleRec/word2vec/train.py index bcc99654..86196e4c 100644 --- a/PaddleRec/word2vec/train.py +++ b/PaddleRec/word2vec/train.py @@ -12,6 +12,11 @@ import six import reader from net import skip_gram_word2vec +import utils +import sys +reload(sys) +sys.setdefaultencoding('utf-8') + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger("fluid") logger.setLevel(logging.INFO) diff --git a/PaddleRec/word2vec/utils.py b/PaddleRec/word2vec/utils.py index 0d173005..67222408 100644 --- a/PaddleRec/word2vec/utils.py +++ b/PaddleRec/word2vec/utils.py @@ -12,7 +12,7 @@ import preprocess def BuildWord_IdMap(dict_path): word_to_id = dict() id_to_word = dict() - with open(dict_path, 'r') as f: + with open(dict_path, 'r', encoding='utf-8') as f: for line in f: word_to_id[line.split(' ')[0]] = int(line.split(' ')[1]) id_to_word[int(line.split(' ')[1])] = line.split(' ')[0] @@ -89,7 +89,7 @@ def reader_creator(file_dir, word_to_id): def reader(): files = os.listdir(file_dir) for fi in files: - with open(file_dir + '/' + fi, "r") as f: + with open(os.path.join(file_dir, fi), "r", encoding='utf-8') as f: for line in f: if ':' in line: pass -- GitLab