From b8555a93b4d5aca679891e842c58978d617d37b6 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Wed, 9 Jan 2019 07:28:13 +0000
Subject: [PATCH] fix preprocess

---
 fluid/PaddleRec/word2vec/README.cn.md   |  3 +++
 fluid/PaddleRec/word2vec/README.md      |  5 +++++
 fluid/PaddleRec/word2vec/async_train.py |  3 ++-
 fluid/PaddleRec/word2vec/reader.py      | 14 +++++++++++---
 fluid/PaddleRec/word2vec/train.py       | 11 +++++++++--
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/fluid/PaddleRec/word2vec/README.cn.md b/fluid/PaddleRec/word2vec/README.cn.md
index 37d3fd6d..f83444c1 100644
--- a/fluid/PaddleRec/word2vec/README.cn.md
+++ b/fluid/PaddleRec/word2vec/README.cn.md
@@ -45,12 +45,15 @@ python train.py \
     --with_nce --is_local \
     2>&1 | tee train.log
 ```
+
 With the async executor:
 ```bash
 python async_train.py --train_data_path ./async_data/ \
     --dict_path data/1-billion_dict --with_nce --with_hs \
     --epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
 ```
+
+If you want to use a supported third-party vocabulary, set --other_dict_path to the directory where the vocabulary you will use is stored, and set --with_other_dict to use it.
 ### Distributed Training
 Launch a local distributed training job with 2 trainers and 2 pservers. In the distributed setting, the training data is split by trainer id so that the data does not overlap between trainers, which improves training efficiency.
diff --git a/fluid/PaddleRec/word2vec/README.md b/fluid/PaddleRec/word2vec/README.md
index a823d822..aca7002a 100644
--- a/fluid/PaddleRec/word2vec/README.md
+++ b/fluid/PaddleRec/word2vec/README.md
@@ -53,6 +53,7 @@ python train.py \
     --with_nce --is_local \
     2>&1 | tee train.log
 ```
+
 With the async executor:
 ```bash
 python async_train.py --train_data_path ./async_data/ \
@@ -60,6 +61,10 @@ python async_train.py --train_data_path ./async_data/ \
     --dict_path data/1-billion_dict --with_nce --with_hs \
     --epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
 ```
+If you would like to use our supported third-party vocab, set --other_dict_path to the directory where the
+vocab you will use is saved, and turn on --with_other_dict to use it.
+
+
 ### Distributed Train
 Run a distributed training job with 2 pservers and 2 trainers on a single machine.
 In the distributed training setting, the training data is split by trainer_id so that the data does not overlap between trainers.
diff --git a/fluid/PaddleRec/word2vec/async_train.py b/fluid/PaddleRec/word2vec/async_train.py
index 29094d9a..91301088 100644
--- a/fluid/PaddleRec/word2vec/async_train.py
+++ b/fluid/PaddleRec/word2vec/async_train.py
@@ -174,7 +174,8 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
 def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            reader.train((args.with_hs or (not args.with_nce))),
+            reader.train((args.with_hs or (not args.with_nce)),
+                         args.with_other_dict),
             buf_size=args.batch_size * 100),
         batch_size=args.batch_size)
diff --git a/fluid/PaddleRec/word2vec/reader.py b/fluid/PaddleRec/word2vec/reader.py
index 1b1b6a9d..69ae84c8 100644
--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -105,7 +105,7 @@ class Word2VecReader(object):

         return set(targets)

-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
         def _reader():
             for file in self.filelist:
                 with io.open(
@@ -116,7 +116,11 @@ class Word2VecReader(object):
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
@@ -140,7 +144,11 @@ class Word2VecReader(object):
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line, self.word_count)
+                            if with_other_dict:
+                                line = preprocess.strip_lines(line,
+                                                              self.word_count)
+                            else:
+                                line = preprocess.text_strip(line)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py
index df32e0b4..6fbe7a79 100644
--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -116,6 +116,13 @@ def parse_args():
         default=False,
         help='Do inference every 100 batches , (default: False)')

+    parser.add_argument(
+        '--with_other_dict',
+        action='store_true',
+        required=False,
+        default=False,
+        help='Whether to use a third-party dict (default: False)')
+
     parser.add_argument(
         '--rank_num',
         type=int,
@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     py_reader.decorate_tensor_provider(
         convert_python_to_tensor(args.batch_size,
                                  reader.train((args.with_hs or (
-                                     not args.with_nce))), (args.with_hs or (
-                                     not args.with_nce))))
+                                     not args.with_nce)), args.with_other_dict),
+                                 (args.with_hs or (not args.with_nce))))

     place = fluid.CPUPlace()
--
GitLab
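To make the new code path concrete, here is a minimal sketch of the preprocessing branch this patch adds to Word2VecReader. Only the call signatures of text_strip(line) and strip_lines(line, word_count) come from the diff; their bodies below are assumed stand-ins for the real helpers in preprocess.py, which this patch does not show.

```python
import re


def text_strip(line):
    # Assumed behavior: basic cleanup that keeps lowercase words and spaces.
    return re.sub("[^a-z ]", " ", line.lower())


def strip_lines(line, word_count):
    # Assumed behavior: the same cleanup, plus dropping words that are
    # missing from the vocabulary's word_count table (useful when the
    # vocabulary comes from a third party and may not cover the corpus).
    words = text_strip(line).split()
    return " ".join(w for w in words if w in word_count)


def preprocess_line(line, word_count, with_other_dict):
    # Mirrors the branch added to Word2VecReader.train() in this patch:
    # a third-party dict takes the stricter strip_lines() path, while the
    # default dict only needs text_strip().
    if with_other_dict:
        return strip_lines(line, word_count)
    return text_strip(line)
```

The design point is that the choice is made once per line at read time, keyed off the same with_other_dict flag that train.py now exposes on the command line.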
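And here is a sketch of how the flag travels from the command line into the updated reader.train(with_hs, with_other_dict) signature, matching the call sites the patch touches in train.py and async_train.py. The argparse defaults are illustrative, and word2vec_reader stands in for a constructed reader.Word2VecReader instance.

```python
import argparse

import paddle


def parse_args():
    # Only the flags relevant to this patch; names match train.py.
    parser = argparse.ArgumentParser()
    parser.add_argument('--with_other_dict', action='store_true', default=False)
    parser.add_argument('--with_hs', action='store_true', default=False)
    parser.add_argument('--with_nce', action='store_true', default=False)
    parser.add_argument('--batch_size', type=int, default=100)
    return parser.parse_args()


def build_train_reader(args, word2vec_reader):
    # word2vec_reader is a reader.Word2VecReader; after this patch its
    # train() takes with_other_dict as a second positional argument.
    return paddle.batch(
        paddle.reader.shuffle(
            word2vec_reader.train(args.with_hs or (not args.with_nce),
                                  args.with_other_dict),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
```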