From 28368f7d9c452b0c3a0c69b9b6b53b407d08358b Mon Sep 17 00:00:00 2001 From: barrierye Date: Fri, 10 Apr 2020 21:00:52 +0800 Subject: [PATCH] add imdb_reader --- ensemble-demo/imdb_reader.py | 92 ++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 ensemble-demo/imdb_reader.py diff --git a/ensemble-demo/imdb_reader.py b/ensemble-demo/imdb_reader.py new file mode 100644 index 00000000..a4ef3e16 --- /dev/null +++ b/ensemble-demo/imdb_reader.py @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# pylint: disable=doc-string-missing + +import sys +import os +import paddle +import re +import paddle.fluid.incubate.data_generator as dg + +py_version = sys.version_info[0] + + +class IMDBDataset(dg.MultiSlotDataGenerator): + def load_resource(self, dictfile): + self._vocab = {} + wid = 0 + if py_version == 2: + with open(dictfile) as f: + for line in f: + self._vocab[line.strip()] = wid + wid += 1 + else: + with open(dictfile, encoding="utf-8") as f: + for line in f: + self._vocab[line.strip()] = wid + wid += 1 + self._unk_id = len(self._vocab) + self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))') + self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0]) + + def get_words_only(self, line): + sent = line.lower().replace("
", " ").strip() + words = [x for x in self._pattern.split(sent) if x and x != " "] + feas = [ + self._vocab[x] if x in self._vocab else self._unk_id for x in words + ] + return feas + + def get_words_and_label(self, line): + send = '|'.join(line.split('|')[:-1]).lower().replace("
", + " ").strip() + label = [int(line.split('|')[-1])] + + words = [x for x in self._pattern.split(send) if x and x != " "] + feas = [ + self._vocab[x] if x in self._vocab else self._unk_id for x in words + ] + return feas, label + + def infer_reader(self, infer_filelist, batch, buf_size): + def local_iter(): + for fname in infer_filelist: + with open(fname, "r") as fin: + for line in fin: + feas, label = self.get_words_and_label(line) + yield feas, label + + import paddle + batch_iter = paddle.batch( + paddle.reader.shuffle( + local_iter, buf_size=buf_size), + batch_size=batch) + return batch_iter + + def generate_sample(self, line): + def memory_iter(): + for i in range(1000): + yield self.return_value + + def data_iter(): + feas, label = self.get_words_and_label(line) + yield ("words", feas), ("label", label) + + return data_iter + + +if __name__ == "__main__": + imdb = IMDBDataset() + imdb.load_resource("imdb.vocab") + imdb.run_from_stdin() -- GitLab