提交 075d74cf 编写于 作者: J JiabinYang

fix_preprocess_without_3rd_part_dict

上级 62148b78
......@@ -23,7 +23,7 @@ cd data && ./download.sh && cd ..
对数据进行预处理以生成一个词典。
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict --is_local
```
如果您想使用我们支持的第三方词汇表，请将 --other_dict_path 设置为存放该词汇表的目录，并加上 --with_other_dict 参数以启用它。
......@@ -40,6 +40,7 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
如果您想使用我们支持的第三方词汇表，请将 --other_dict_path 设置为存放该词汇表的目录，并加上 --with_other_dict 参数以启用它。
### 分布式训练
......
......@@ -47,7 +47,8 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
If you would like to use one of our supported third-party vocabularies, set --other_dict_path to the directory where
the vocabulary is saved and pass the --with_other_dict flag to enable it.
### Distributed Train
Run distributed training with 2 pservers and 2 trainers on a single machine.
......
......@@ -209,15 +209,17 @@ def preprocess(args):
args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
encoding='utf-8') as f:
for line in f:
if args.with_other_dict:
line = strip_lines(line)
words = line.split()
if args.with_other_dict:
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
else:
line = text_strip(line)
words = line.split()
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
......
......@@ -105,7 +105,7 @@ class Word2VecReader(object):
return set(targets)
def train(self, with_hs):
def train(self, with_hs, with_other_dict):
def _reader():
for file in self.filelist:
with io.open(
......@@ -116,7 +116,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line, self.word_count)
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......@@ -140,7 +144,11 @@ class Word2VecReader(object):
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line, self.word_count)
if with_other_dict:
line = preprocess.strip_lines(line,
self.word_count)
else:
line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......
......@@ -116,6 +116,13 @@ def parse_args():
default=False,
help='Do inference every 100 batches , (default: False)')
parser.add_argument(
'--with_other_dict',
action='store_true',
required=False,
default=False,
help='if use other dict , (default: False)')
parser.add_argument(
'--rank_num',
type=int,
......@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce))), (args.with_hs or (
not args.with_nce))))
not args.with_nce)), args.with_other_dict),
(args.with_hs or (not args.with_nce))))
place = fluid.CPUPlace()
......@@ -261,7 +268,7 @@ def train(args):
args.dict_path, args.train_data_path, filelist, 0, 1)
else:
trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
trainers = int(os.environ["PADDLE_TRAINERS"])
trainer_num = int(os.environ["PADDLE_TRAINERS"])
word2vec_reader = reader.Word2VecReader(args.dict_path,
args.train_data_path, filelist,
trainer_id, trainer_num)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册