Commit b8555a93 authored by JiabinYang

fix preprocess

Parent 28b8943b
......@@ -45,12 +45,15 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Using the async executor:
```bash
python async_train.py --train_data_path ./async_data/ \
--dict_path data/1-billion_dict --with_nce --with_hs \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where the vocabulary you will use is stored, and turn on --with_other_dict to use it.
### Distributed Training
Launch a distributed training job with 2 trainers and 2 pservers on the local machine. In the distributed setting, training data is partitioned by trainer id, which guarantees that the data seen by different trainers does not overlap and improves training efficiency.
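For intuition, the modulo-based sharding used by the reader (see the `trainer_id == count % trainer_num` check in the reader diff further below) can be sketched in a few lines of plain Python; `shard_lines` is an illustrative name, not part of the repo:

```python
# Illustrative sketch of round-robin data sharding by trainer id,
# mirroring the `trainer_id == count % trainer_num` check in the reader.
def shard_lines(lines, trainer_id, trainer_num):
    count = 1
    for line in lines:
        # each trainer keeps only the lines whose running index falls
        # into its own residue class, so shards never overlap
        if trainer_id == count % trainer_num:
            yield line
        count += 1

data = ["l1", "l2", "l3", "l4"]
print(list(shard_lines(data, 0, 2)))  # ['l2', 'l4']
print(list(shard_lines(data, 1, 2)))  # ['l1', 'l3']
```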
......
......@@ -53,6 +53,7 @@ python train.py \
--with_nce --is_local \
2>&1 | tee train.log
```
Using the async executor:
```bash
python async_train.py --train_data_path ./async_data/ \
......@@ -60,6 +61,10 @@ python async_train.py --train_data_path ./async_data/ \
--epochs 1 --thread_num 1 --is_sparse --batch_size 100 --is_local 2>&1 | tee async_trainer1.log
```
If you would like to use one of the supported third-party vocabularies, set --other_dict_path to the directory where you
saved the vocabulary you will use, and turn the --with_other_dict flag on to use it.
### Distributed Training
Run a distributed training job with 2 pservers and 2 trainers on a single machine.
In the distributed training setting, training data is split by trainer_id, so that training data
......
......@@ -174,7 +174,8 @@ def async_train_loop(args, train_program, dataset, loss, thread_num):
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
-           reader.train((args.with_hs or (not args.with_nce))),
+           reader.train((args.with_hs or (not args.with_nce)),
+                        args.with_other_dict),
            buf_size=args.batch_size * 100),
        batch_size=args.batch_size)
......
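The change above only threads args.with_other_dict through to reader.train; the surrounding paddle.batch(paddle.reader.shuffle(...)) composition is untouched. As a rough mental model (a pure-Python sketch, not Paddle's implementation), that composition shuffles samples within a bounded buffer and then groups them into fixed-size batches:

```python
import random

# Sketch of what the paddle.reader.shuffle + paddle.batch composition
# does: shuffle within a bounded buffer, then emit fixed-size batches.
def shuffled_batches(sample_reader, buf_size, batch_size):
    buf, batch = [], []
    for sample in sample_reader():
        buf.append(sample)
        if len(buf) >= buf_size:
            random.shuffle(buf)
            while buf:
                batch.append(buf.pop())
                if len(batch) == batch_size:
                    yield batch
                    batch = []
    random.shuffle(buf)  # flush whatever is left in the buffer
    for sample in buf:
        batch.append(sample)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

for b in shuffled_batches(lambda: iter(range(10)), buf_size=4, batch_size=3):
    print(b)
```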
......@@ -105,7 +105,7 @@ class Word2VecReader(object):
        return set(targets)
-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
        def _reader():
            for file in self.filelist:
                with io.open(
......@@ -116,7 +116,11 @@ class Word2VecReader(object):
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                           line = preprocess.strip_lines(line, self.word_count)
+                           if with_other_dict:
+                               line = preprocess.strip_lines(line,
+                                                             self.word_count)
+                           else:
+                               line = preprocess.text_strip(line)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_
......@@ -140,7 +144,11 @@ class Word2VecReader(object):
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                           line = preprocess.strip_lines(line, self.word_count)
+                           if with_other_dict:
+                               line = preprocess.strip_lines(line,
+                                                             self.word_count)
+                           else:
+                               line = preprocess.text_strip(line)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_
......
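The two branches select between preprocess.strip_lines (used with a third-party dict, and parameterized by self.word_count) and preprocess.text_strip. The real helpers live in preprocess.py and are not shown in this diff; the sketch below is only a hypothetical illustration of such a split, with both function bodies assumed:

```python
import re

# Hypothetical stand-ins for preprocess.text_strip / preprocess.strip_lines;
# the actual implementations in preprocess.py may differ.
_pattern = re.compile(r"[^a-z ]")

def text_strip(line):
    # plain normalization: lowercase and drop non-letter characters
    return _pattern.sub("", line.lower())

def strip_lines(line, word_count):
    # normalization plus filtering against a known vocabulary,
    # which is why this path needs self.word_count
    words = text_strip(line).split()
    return " ".join(w for w in words if w in word_count)

vocab = {"the": 10, "cat": 3}
print(text_strip("The cat, obviously!"))          # "the cat obviously"
print(strip_lines("The cat, obviously!", vocab))  # "the cat"
```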
......@@ -116,6 +116,13 @@ def parse_args():
        default=False,
        help='Do inference every 100 batches , (default: False)')
+   parser.add_argument(
+       '--with_other_dict',
+       action='store_true',
+       required=False,
+       default=False,
+       help='if use other dict , (default: False)')
    parser.add_argument(
        '--rank_num',
        type=int,
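The new --with_other_dict switch is a standard argparse boolean flag: action='store_true' already defaults to False, and optional arguments are not required, so the explicit required=False/default=False in the commit are belt-and-braces. A minimal standalone sketch of the same pattern:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--with_other_dict',
    action='store_true',  # False unless the flag is given on the CLI
    help='use a third-party dict (default: False)')

print(parser.parse_args([]).with_other_dict)                     # False
print(parser.parse_args(['--with_other_dict']).with_other_dict)  # True
```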
......@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(args.batch_size,
                                 reader.train((args.with_hs or (
-                                    not args.with_nce))), (args.with_hs or (
-                                        not args.with_nce))))
+                                    not args.with_nce)), args.with_other_dict),
+                                 (args.with_hs or (not args.with_nce))))
    place = fluid.CPUPlace()
......
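convert_python_to_tensor (defined elsewhere in train.py) adapts the sample reader for py_reader.decorate_tensor_provider; its body is not shown in this diff. Below is a hypothetical numpy-only sketch of that kind of adapter, which groups samples into fixed-size batches of arrays; the real helper additionally builds fluid tensors:

```python
import numpy as np

# Hypothetical adapter in the spirit of convert_python_to_tensor:
# group (input_word, target_word) samples into batch_size-long columns.
def batch_as_arrays(sample_reader, batch_size):
    def _provider():
        batch = []
        for sample in sample_reader():
            batch.append(sample)
            if len(batch) == batch_size:
                # one ndarray per field, shaped [batch_size, 1]
                yield [np.array(col).reshape(-1, 1) for col in zip(*batch)]
                batch = []
    return _provider

provider = batch_as_arrays(lambda: iter([(1, 2), (3, 4)]), batch_size=2)
for tensors in provider():
    print([t.shape for t in tensors])  # [(2, 1), (2, 1)]
```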