diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index 00239c6009b8503cf445d9847abde92db12db2fe..0095c6f7272a2191ea39e042a836f7d6038032aa 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +import os, sys import numpy as np from optparse import OptionParser from py_paddle import swig_paddle, DataProviderConverter @@ -66,35 +66,27 @@ class SentimentPrediction(): for v in open(label_file, 'r'): self.label[int(v.split('\t')[1])] = v.split('\t')[0] - def get_data(self, data_file): + def get_index(self, data): """ - Get input data of paddle format. + transform word into integer index according to the dictionary. """ - with open(data_file, 'r') as fdata: - for line in fdata: - words = line.strip().split() - word_slot = [ - self.word_dict[w] for w in words if w in self.word_dict - ] - if not word_slot: - print "all words are not in dictionary: %s", line - continue - yield [word_slot] + words = data.strip().split() + word_slot = [ + self.word_dict[w] for w in words if w in self.word_dict + ] + return word_slot - def predict(self, data_file): - """ - data_file: file name of input data. 
- """ - input = self.converter(self.get_data(data_file)) + def batch_predict(self, data_batch): + input = self.converter(data_batch) output = self.network.forwardTest(input) prob = output[0]["value"] - lab = np.argsort(-prob) - if self.label is None: - print("%s: predicting label is %d" % (data_file, lab[0][0])) - else: - print("%s: predicting label is %s" % - (data_file, self.label[lab[0][0]])) - + labs = np.argsort(-prob) + for idx, lab in enumerate(labs): + if self.label is None: + print("predicting label is %d" % (lab[0])) + else: + print("predicting label is %s" % + (self.label[lab[0]])) def option_parser(): usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " @@ -119,11 +111,13 @@ def option_parser(): default=None, help="dictionary file") parser.add_option( - "-i", - "--data", + "-c", + "--batch_size", + type="int", action="store", - dest="data", - help="data file to predict") + dest="batch_size", + default=1, + help="the batch size for prediction") parser.add_option( "-w", "--model", @@ -137,14 +131,21 @@ def option_parser(): def main(): options, args = option_parser() train_conf = options.train_conf - data = options.data + batch_size = options.batch_size dict_file = options.dict_file model_path = options.model_path label = options.label swig_paddle.initPaddle("--use_gpu=0") predict = SentimentPrediction(train_conf, dict_file, model_path, label) - predict.predict(data) + batch = [] + for line in sys.stdin: + batch.append([predict.get_index(line)]) + if len(batch) == batch_size: + predict.batch_predict(batch) + batch=[] + if len(batch) > 0: + predict.batch_predict(batch) if __name__ == '__main__': main() diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh index a889dfe3ec6635bd1ab2b60ae7207815cd205416..c72a8e8641516543ef267fcb4b448630246d1e8d 100755 --- a/demo/sentiment/predict.sh +++ b/demo/sentiment/predict.sh @@ -19,9 +19,9 @@ set -e model=model_output/pass-00002/ config=trainer_config.py 
label=data/pre-imdb/labels.list -python predict.py \ - -n $config\ - -w $model \ - -b $label \ - -d ./data/pre-imdb/dict.txt \ - -i ./data/aclImdb/test/pos/10007_10.txt +cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ + --tconf=$config\ + --model=$model \ + --label=$label \ + --dict=./data/pre-imdb/dict.txt \ + --batch_size=1 diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md index c53952c544de9fa88a6318432e34b0d05b149445..bb7681db44ca6f286ad6935ddfecb9becb429192 100644 --- a/doc/tutorials/sentiment_analysis/index_en.md +++ b/doc/tutorials/sentiment_analysis/index_en.md @@ -293,20 +293,21 @@ predict.sh: model=model_output/pass-00002/ config=trainer_config.py label=data/pre-imdb/labels.list -python predict.py \ - -n $config\ - -w $model \ - -b $label \ - -d data/pre-imdb/dict.txt \ - -i data/aclImdb/test/pos/10007_10.txt -``` - -* `predict.py`: predicting interface. -* -n $config : set network configure. -* -w $model: set model path. -* -b $label: set dictionary about corresponding relation between integer label and string label. -* -d data/pre-imdb/dict.txt: set dictionary. -* -i data/aclImdb/test/pos/10014_7.txt: set one example file to predict. +cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ + --tconf=$config\ + --model=$model \ + --label=$label \ + --dict=./data/pre-imdb/dict.txt \ + --batch_size=1 +``` + +* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample. +* `predict.py` : the prediction interface. +* `--tconf=$config` : set the network configuration. +* `--model=$model` : set the model path. +* `--label=$label` : set the dictionary that maps integer labels to string labels. +* `--dict=./data/pre-imdb/dict.txt` : set the word dictionary file. +* `--batch_size=1` : set the batch size. Note you should make sure the default model path `model_output/pass-00002` exists or change the model path. 
diff --git a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md b/doc_cn/demo/sentiment_analysis/sentiment_analysis.md index b70f2d59675615c26b29932cdf99d728bb206148..ba307e97e3010629548460e25e894d082a6ddd4e 100644 --- a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md +++ b/doc_cn/demo/sentiment_analysis/sentiment_analysis.md @@ -291,20 +291,21 @@ predict.sh: model=model_output/pass-00002/ config=trainer_config.py label=data/pre-imdb/labels.list -python predict.py \ - -n $config\ - -w $model \ - -b $label \ - -d data/pre-imdb/dict.txt \ - -i data/aclImdb/test/pos/10007_10.txt -``` - -* `predict.py`: 预测接口脚本。 -* -n $config : 设置网络配置。 -* -w $model: 设置模型路径。 -* -b $label: 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。 -* -d data/pre-imdb/dict.txt: 设置字典文件。 -* -i data/aclImdb/test/pos/10014_7.txt: 设置一个要预测的示例文件。 +cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ + --tconf=$config\ + --model=$model \ + --label=$label \ + --dict=./data/pre-imdb/dict.txt \ + --batch_size=1 +``` + +* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。 +* `predict.py` : 预测接口脚本。 +* `--tconf=$config` : 设置网络配置。 +* `--model=$model` : 设置模型路径。 +* `--label=$label` : 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。 +* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。 +* `--batch_size=1` : 设置batch size。 注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。