Unverified commit aea40b82, authored by Jiabin Yang, committed by GitHub

Merge pull request #1624 from JiabinYang/fix_preprocess_without_3rd_part_dict

fix_preprocess_without_3rd_part_dict
@@ -25,7 +25,14 @@ cd data && ./download.sh && cd ..
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
-If you would like to use the supported third-party vocab, please set --other_dict_path to the directory where the vocab you will use is stored, and set --with_other_dict to use it.
+If you would like to use a custom dict in a format like:
+```bash
+<UNK>
+a
+b
+c
+```
+please set --other_dict_path to the directory where the dict you will use is stored, and set --with_other_dict to use it.
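For example, a full preprocessing command with a third-party dict might look like the sketch below (`./data/your_dict` is a placeholder path, not a file shipped with this repo; the other flags mirror the command above):
```bash
python preprocess.py \
    --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_other_dict \
    --other_dict_path ./data/your_dict
```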
## Train
The command-line options for training can be listed with `python train.py -h`.
@@ -40,6 +47,14 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
+If you would like to use a custom dict in a format like:
+```bash
+<UNK>
+a
+b
+c
+```
+please set --other_dict_path to the directory where the dict you will use is stored, and set --with_other_dict to use it.
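For example, a training command that reads such a dict might look like this sketch (the dict path is a placeholder, and the remaining flags mirror the command above):
```bash
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_hs --with_nce --is_local \
    --with_other_dict \
    --other_dict_path ./data/your_dict \
    2>&1 | tee train.log
```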
### Distributed Training
......
@@ -29,9 +29,16 @@ This model implements the skip-gram model of word2vec.
Preprocess the training data to generate a word dict.
```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
-if you would like to use our supported third party vocab, please set --other_dict_path as the directory of where you
+if you would like to use your own vocab, follow the format below:
+```bash
+<UNK>
+a
+b
+c
+```
+Then, please set --other_dict_path to the directory where you
save the vocab you will use, and set the --with_other_dict flag to use it.
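As a concrete illustration, the flags could be combined like this (a sketch only; `./data/your_dict` stands in for wherever you saved the dict):
```bash
python preprocess.py \
    --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_other_dict \
    --other_dict_path ./data/your_dict
```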
## Train
@@ -47,7 +54,8 @@ python train.py \
--with_hs --with_nce --is_local \
2>&1 | tee train.log
```
if you would like to use our supported third-party vocab, please set --other_dict_path to the directory where you
save the vocab you will use, and set the --with_other_dict flag to use it.
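For instance (a sketch; the dict path is a placeholder and the other flags mirror the command above):
```bash
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    --with_hs --with_nce --is_local \
    --with_other_dict \
    --other_dict_path ./data/your_dict \
    2>&1 | tee train.log
```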
### Distributed Training
Run a distributed training job with 2 pservers and 2 trainers on a single machine.
......
@@ -27,12 +27,6 @@ def parse_args():
         type=int,
         default=5,
         help="If the word count is less than freq, it will be removed from dict")
-    parser.add_argument(
-        '--is_local',
-        action='store_true',
-        required=False,
-        default=False,
-        help='Local train or not, (default: False)')
parser.add_argument(
'--with_other_dict',
@@ -203,26 +197,27 @@ def preprocess(args):
             for line in f:
                 word_count[native_to_unicode(line.strip())] = 1
-    if args.is_local:
-        for i in range(1, 100):
-            with io.open(
-                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
-                    encoding='utf-8') as f:
-                for line in f:
-                    line = strip_lines(line)
-                    words = line.split()
-                    if args.with_other_dict:
-                        for item in words:
-                            if item in word_count:
-                                word_count[item] = word_count[item] + 1
-                            else:
-                                word_count[native_to_unicode('<UNK>')] += 1
-                    else:
-                        for item in words:
-                            if item in word_count:
-                                word_count[item] = word_count[item] + 1
-                            else:
-                                word_count[item] = 1
+    for i in range(1, 100):
+        with io.open(
+                args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                encoding='utf-8') as f:
+            for line in f:
+                if args.with_other_dict:
+                    line = strip_lines(line)
+                    words = line.split()
+                    for item in words:
+                        if item in word_count:
+                            word_count[item] = word_count[item] + 1
+                        else:
+                            word_count[native_to_unicode('<UNK>')] += 1
+                else:
+                    line = text_strip(line)
+                    words = line.split()
+                    for item in words:
+                        if item in word_count:
+                            word_count[item] = word_count[item] + 1
+                        else:
+                            word_count[item] = 1
     item_to_remove = []
     for item in word_count:
         if word_count[item] <= args.freq:
......
@@ -105,7 +105,7 @@ class Word2VecReader(object):
return set(targets)
-    def train(self, with_hs):
+    def train(self, with_hs, with_other_dict):
def _reader():
for file in self.filelist:
with io.open(
@@ -116,7 +116,11 @@ class Word2VecReader(object):
                 count = 1
                 for line in f:
                     if self.trainer_id == count % self.trainer_num:
-                        line = preprocess.strip_lines(line, self.word_count)
+                        if with_other_dict:
+                            line = preprocess.strip_lines(line,
+                                                          self.word_count)
+                        else:
+                            line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
@@ -140,7 +144,11 @@
                 count = 1
                 for line in f:
                     if self.trainer_id == count % self.trainer_num:
-                        line = preprocess.strip_lines(line, self.word_count)
+                        if with_other_dict:
+                            line = preprocess.strip_lines(line,
+                                                          self.word_count)
+                        else:
+                            line = preprocess.text_strip(line)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
......
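For intuition, the sketch below mimics what the two branches above do to a line of text. It is illustrative only: the real strip_lines and text_strip live in preprocess.py, and the normalization assumed here is inferred from how the word counts are gathered in that file.
```python
# Illustrative stand-ins for preprocess.text_strip and preprocess.strip_lines.
word_count = {'<UNK>': 1, 'a': 3, 'b': 2}  # toy third-party dict

def text_strip_sketch(line):
    # Default path: only normalize the raw text; tokens are matched
    # against the model's own dict later.
    return line.lower().strip()

def strip_lines_sketch(line, word_count):
    # Third-party-dict path: tokens absent from the external dict are
    # mapped to '<UNK>' so every token still hits a valid dict entry.
    tokens = line.lower().strip().split()
    return ' '.join(t if t in word_count else '<UNK>' for t in tokens)

print(strip_lines_sketch('a b zzz', word_count))  # -> 'a b <UNK>'
```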
@@ -116,6 +116,13 @@ def parse_args():
         default=False,
         help='Do inference every 100 batches (default: False)')
+    parser.add_argument(
+        '--with_other_dict',
+        action='store_true',
+        required=False,
+        default=False,
+        help='Whether to use a third-party dict (default: False)')
parser.add_argument(
'--rank_num',
type=int,
@@ -161,8 +168,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     py_reader.decorate_tensor_provider(
         convert_python_to_tensor(args.batch_size,
-                                  reader.train((args.with_hs or (
-                                      not args.with_nce))), (args.with_hs or (
-                                          not args.with_nce))))
+                                  reader.train((args.with_hs or (
+                                      not args.with_nce)), args.with_other_dict),
+                                  (args.with_hs or (not args.with_nce))))
place = fluid.CPUPlace()
@@ -261,7 +268,7 @@
             args.dict_path, args.train_data_path, filelist, 0, 1)
     else:
         trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
-        trainers = int(os.environ["PADDLE_TRAINERS"])
+        trainer_num = int(os.environ["PADDLE_TRAINERS"])
word2vec_reader = reader.Word2VecReader(args.dict_path,
args.train_data_path, filelist,
trainer_id, trainer_num)
......
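The non-local branch above reads the trainer topology from environment variables. A minimal sketch of launching one trainer by hand follows; it sets only the two variables the code shown actually reads, and a real pserver/trainer launcher would export more than this:
```bash
# Sketch: start trainer 0 of 2 (PADDLE_TRAINER_ID and PADDLE_TRAINERS are
# exactly the variables read in train() above).
export PADDLE_TRAINER_ID=0
export PADDLE_TRAINERS=2
python train.py \
    --train_data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled \
    --dict_path data/1-billion_dict \
    2>&1 | tee trainer0.log
```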