Commit b8555a93 authored by J JiabinYang

fix preprocess

Parent 28b8943b
...@@ -45,12 +45,15 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Use the async executor
```bash
python async_train.py --train_data_path ./async_data/ \
--dict_path data/1-billion_dict --with_nce --with_hs \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you keep the vocabulary you want to use, and turn on the --with_other_dict flag to use it.
### Distributed Training
Launch a distributed training job with 2 trainers and 2 pservers on the local machine. In the distributed setting, the training data is sharded by trainer id, so the shards do not overlap across trainers, which improves training efficiency.
......
...@@ -53,6 +53,7 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Use the async executor
```bash
python async_train.py --train_data_path ./async_data/ \
...@@ -60,6 +61,10 @@ python async_train.py --train_data_path ./async_data/ \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you keep the vocabulary you want to use, and turn on the --with_other_dict flag to use it.
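For orientation, the flag only switches how each raw training line is preprocessed inside the reader (see the reader.py hunk further down in this commit): with a third-party vocabulary the strip_lines path that takes the word-count dict is used, otherwise the plain text_strip cleanup. A minimal sketch of the switch, mirroring that hunk:

```python
# Inside Word2VecReader.train()'s line loop (mirrors the reader.py change in this commit).
if with_other_dict:
    line = preprocess.strip_lines(line, self.word_count)  # third-party vocab: uses the word-count dict
else:
    line = preprocess.text_strip(line)  # default 1-billion dict: plain text cleanup
```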
### Distributed Train
Run a 2 pserver 2 trainer distributed training job on a single machine.
In the distributed training setting, training data is split by trainer_id, so that training data
......
...@@ -174,7 +174,8 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train((args.with_hs or (not args.with_nce)),
args.with_other_dict),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
......
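For readers unfamiliar with the fluid-era reader pipeline, paddle.reader.shuffle and paddle.batch are decorators over reader creators (functions that return sample generators). A standalone sketch with a dummy reader, assuming the paddle 1.x API this repo targets:

```python
import paddle

def dummy_reader():
    # Stand-in for reader.train(with_hs, with_other_dict): yields one sample at a time.
    for i in range(1000):
        yield [i], [i + 1]

# Shuffle within a 500-sample buffer, then group samples into batches of 100.
batched = paddle.batch(
    paddle.reader.shuffle(dummy_reader, buf_size=500), batch_size=100)

for batch in batched():
    pass  # each batch is a list of up to 100 (target, label) samples
```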
...@@ -105,7 +105,7 @@ class Word2VecReader(object):
return set(targets)
def train(self, with_hs, with_other_dict):
def _reader():
for file in self.filelist:
with io.open(
...@@ -116,7 +116,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
...@@ -140,7 +144,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......
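The two preprocess helpers are not part of this diff; judging only from their call signatures, strip_lines consults the word-frequency dict while text_strip does not. A hypothetical sketch of that distinction (the function names are real, the bodies below are assumptions):

```python
import re

def text_strip(line):
    # Plain cleanup only: lowercase and strip non-alphanumeric characters
    # (assumed behaviour; the real implementation lives in preprocess.py).
    return re.sub(r"[^a-z0-9 ]", " ", line.lower())

def strip_lines(line, word_count):
    # Cleanup plus filtering against the supplied frequency dict, so only words
    # known to the third-party vocabulary survive (again, an assumption).
    words = text_strip(line).split()
    return " ".join(w for w in words if w in word_count)
```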
...@@ -116,6 +116,13 @@ def parse_args():
default=False,
help='Do inference every 100 batches , (default: False)')
parser.add_argument(
'--with_other_dict',
action='store_true',
required=False,
default=False,
help='if use other dict , (default: False)')
parser.add_argument(
'--rank_num',
type=int,
...@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce)), args.with_other_dict),
(args.with_hs or (not args.with_nce))))
place = fluid.CPUPlace()
......
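Putting the pieces together, the flag is parsed once and has to be forwarded to every reader.train() call site. A minimal smoke-test sketch; the module paths and the Word2VecReader constructor arguments are assumptions inferred from this diff, not confirmed by it:

```python
# Hypothetical smoke test for the new flag; module and constructor details are assumed.
from reader import Word2VecReader
from train import parse_args

args = parse_args()                       # now understands --with_other_dict
filelist = ["data/part-0"]                # placeholder training shard

reader = Word2VecReader(args.dict_path, args.train_data_path, filelist,
                        trainer_id=0, trainer_num=1)

# Same boolean pair train_loop() passes: hierarchical-softmax samples are
# generated whenever NCE is not used alone; preprocessing follows the new flag.
sample_reader = reader.train(args.with_hs or (not args.with_nce),
                             args.with_other_dict)

print(next(sample_reader()))              # train() returns a generator function
```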