From 41351679720289a0c431546a2ef7d91e3dce5e77 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 28 Dec 2018 05:41:54 +0000 Subject: [PATCH] refine readme and clean code --- fluid/PaddleRec/word2vec/README.cn.md | 1 + fluid/PaddleRec/word2vec/README.md | 3 ++- fluid/PaddleRec/word2vec/data/download.sh | 4 ++++ fluid/PaddleRec/word2vec/infer.py | 2 -- 4 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 fluid/PaddleRec/word2vec/data/download.sh diff --git a/fluid/PaddleRec/word2vec/README.cn.md b/fluid/PaddleRec/word2vec/README.cn.md index 076b3eef..7ed9ddc3 100644 --- a/fluid/PaddleRec/word2vec/README.cn.md +++ b/fluid/PaddleRec/word2vec/README.cn.md @@ -25,6 +25,7 @@ cd data && ./download.sh && cd .. ```bash python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict ``` +如果您想使用我们支持的第三方词汇表,请将--other_dict_path设置为您存放将使用的词汇表的目录,并设置--with_other_dict使用它 ## 训练 训练的命令行选项可以通过`python train.py -h`列出。 diff --git a/fluid/PaddleRec/word2vec/README.md b/fluid/PaddleRec/word2vec/README.md index 1c5da2a3..01e0696a 100644 --- a/fluid/PaddleRec/word2vec/README.md +++ b/fluid/PaddleRec/word2vec/README.md @@ -31,7 +31,8 @@ Preprocess the training data to generate a word dict. ```bash python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict ``` -if you would like to use our supported third party vocab, please set +If you would like to use one of our supported third-party vocabularies, set --other_dict_path to the directory where you +saved the vocabulary and pass the --with_other_dict flag to enable it. ## Train The command line options for training can be listed by `python train.py -h`. 
diff --git a/fluid/PaddleRec/word2vec/data/download.sh b/fluid/PaddleRec/word2vec/data/download.sh new file mode 100644 index 00000000..22cde6d9 --- /dev/null +++ b/fluid/PaddleRec/word2vec/data/download.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz +tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz diff --git a/fluid/PaddleRec/word2vec/infer.py b/fluid/PaddleRec/word2vec/infer.py index 9ed42d1c..69844c20 100644 --- a/fluid/PaddleRec/word2vec/infer.py +++ b/fluid/PaddleRec/word2vec/infer.py @@ -2,8 +2,6 @@ import time import os import paddle.fluid as fluid import numpy as np -from Queue import PriorityQueue -import heapq import logging import argparse import preprocess -- GitLab