From 871c7167c84e5f735ced372ee9c08aefe702702b Mon Sep 17 00:00:00 2001 From: guru4elephant Date: Tue, 14 Jan 2020 18:28:59 +0800 Subject: [PATCH] add imdb training example and save api --- python/examples/imdb/imdb_reader.py | 70 +++++++++++++++ python/examples/imdb/local_train.py | 68 +++++++++++++++ python/examples/imdb/nets.py | 125 +++++++++++++++++++++++++++ python/paddle_serving/io/__init__.py | 39 +++++++++ 4 files changed, 302 insertions(+) create mode 100644 python/examples/imdb/imdb_reader.py create mode 100644 python/examples/imdb/local_train.py create mode 100644 python/examples/imdb/nets.py create mode 100644 python/paddle_serving/io/__init__.py diff --git a/python/examples/imdb/imdb_reader.py b/python/examples/imdb/imdb_reader.py new file mode 100644 index 00000000..def7ce21 --- /dev/null +++ b/python/examples/imdb/imdb_reader.py @@ -0,0 +1,70 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import sys
import os
import paddle
import re
import paddle.fluid.incubate.data_generator as dg


class IMDBDataset(dg.MultiSlotDataGenerator):
    """Convert ``text|label`` lines into word-id features and a label.

    ``load_resource`` must run before any other method: it creates the
    vocabulary, the out-of-vocabulary id and the tokenizer pattern that
    ``get_words_and_label`` reads.
    """

    def load_resource(self, dictfile):
        """Load the vocabulary from ``dictfile`` (one token per line).

        A token's id is its zero-based line number; if a token appears
        more than once, the id of its last occurrence is kept.

        Args:
            dictfile: path of the vocabulary file.
        """
        self._vocab = {}
        with open(dictfile) as f:
            for wid, line in enumerate(f):
                self._vocab[line.strip()] = wid
        # Every word missing from the vocabulary maps to this single id.
        self._unk_id = len(self._vocab)
        # The capture group makes ``split`` return the delimiters too, so
        # punctuation marks become tokens of their own.
        self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
        # Not read anywhere else in this class; kept because external
        # code may access the attribute.
        self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0])

    def get_words_and_label(self, line):
        """Split one ``text|label`` line into word ids and its label.

        Everything before the last ``|`` is the text, the remainder is
        the label.

        Args:
            line: one raw input line.

        Returns:
            ``(feas, label)``: the list of word ids and a one-element
            list holding the integer label.

        Raises:
            ValueError: if the part after the last ``|`` is not an int.
        """
        text, _, raw_label = line.rpartition('|')
        # HTML line breaks in the review text are treated as whitespace.
        send = text.lower().replace("<br />", " ").strip()
        label = [int(raw_label)]

        # Drop the empty strings and single spaces produced by the split.
        words = [x for x in self._pattern.split(send) if x and x != " "]
        feas = [self._vocab.get(x, self._unk_id) for x in words]
        return feas, label

    def infer_reader(self, infer_filelist, batch, buf_size):
        """Build a reader over the samples of ``infer_filelist``.

        Args:
            infer_filelist: iterable of input file paths.
            batch: value passed to ``paddle.batch`` as ``batch_size``.
            buf_size: value passed to ``paddle.reader.shuffle``.

        Returns:
            The reader returned by ``paddle.batch``.
        """

        def local_iter():
            for fname in infer_filelist:
                with open(fname, "r") as fin:
                    for line in fin:
                        yield self.get_words_and_label(line)

        return paddle.batch(
            paddle.reader.shuffle(local_iter, buf_size=buf_size),
            batch_size=batch)

    def generate_sample(self, line):
        """Return a generator function yielding the sample for ``line``.

        The returned function yields one
        ``(("words", feas), ("label", label))`` pair.
        """

        def data_iter():
            feas, label = self.get_words_and_label(line)
            yield ("words", feas), ("label", label)

        return data_iter


if __name__ == "__main__":
    imdb = IMDBDataset()
    imdb.load_resource("imdb.vocab")
    imdb.run_from_stdin()

# ---- python/examples/imdb/local_train.py ----
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import paddle
import logging
import paddle.fluid as fluid
import paddle_serving as serving

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def load_vocab(filename):
    """Load a vocabulary file and append an out-of-vocabulary entry.

    Args:
        filename: path of a text file with one token per line.

    Returns:
        A dict mapping each stripped line to its zero-based line number,
        plus an ``"<unk>"`` entry whose id is the number of entries
        present before it was added.
    """
    vocab = {}
    with open(filename) as f:
        for wid, line in enumerate(f):
            vocab[line.strip()] = wid
    # imdb_reader.py maps unknown words to len(vocab), so the embedding
    # needs one extra row. A dedicated key is used because an empty-string
    # key would collide with a blank line in the vocabulary file and add
    # no row at all.
    vocab["<unk>"] = len(vocab)
    return vocab


if __name__ == "__main__":
    vocab = load_vocab('imdb.vocab')
    dict_dim = len(vocab)

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    dataset = fluid.DatasetFactory().create_dataset()
    filelist = ["train_data/%s" % x for x in os.listdir("train_data")]
    dataset.set_use_var([data, label])
    # Each input file is piped through imdb_reader.py.
    pipe_command = "python imdb_reader.py"
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(4)
    dataset.set_filelist(filelist)
    dataset.set_thread(10)
    from nets import cnn_net
    avg_cost, acc, prediction = cnn_net(data, label, dict_dim)
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimizer.minimize(avg_cost)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    epochs = 30
    save_dirname = "cnn_model"
    for i in range(epochs):
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            debug=False)
        logger.info("TRAIN --> pass: {}".format(i))
        model_dir = "%s/epoch%d.model" % (save_dirname, i)
        fluid.io.save_inference_model(model_dir,
                                      [data.name, label.name], [acc], exe)
        # Keyword arguments follow the signature of
        # paddle_serving.io.save_model(server_model_folder,
        # client_config_folder, feed_var_dict, fetch_var_dict,
        # main_program).
        # NOTE(review): the client config folder name is a placeholder;
        # save_model does not use that argument yet - confirm the layout.
        serving.io.save_model(
            server_model_folder=model_dir,
            client_config_folder="%s/epoch%d.client_conf" % (save_dirname,
                                                              i),
            feed_var_dict={"words": data,
                           "label": label},
            fetch_var_dict={"acc": acc},
            main_program=fluid.default_main_program())

# ---- python/examples/imdb/nets.py ----
import sys
import time
import numpy as np

import paddle
import paddle.fluid as fluid


def _loss_and_accuracy(prediction, label):
    """Return ``(avg_cost, acc)`` of ``prediction`` against ``label``.

    Builds cross-entropy, its mean and accuracy, in that order.
    """
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc


def bow_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2):
    """Build a bag-of-words classifier.

    Layers: embedding -> sum sequence pool -> tanh -> fc(hid_dim, tanh)
    -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).

    Args:
        data: variable used as the embedding input.
        label: variable used for cross-entropy and accuracy.
        dict_dim: number of rows of the embedding table.
        emb_dim: width of the embedding table.
        hid_dim: size of the first hidden fc layer.
        hid_dim2: size of the second hidden fc layer.
        class_dim: size of the softmax output layer.

    Returns:
        ``(avg_cost, acc, prediction)``.
    """
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], is_sparse=True)
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
    avg_cost, acc = _loss_and_accuracy(prediction, label)

    return avg_cost, acc, prediction


def cnn_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            win_size=3):
    """Build a sequence-convolution classifier.

    Layers: embedding -> sequence_conv_pool(hid_dim filters of width
    win_size, tanh, max pool) -> fc(hid_dim2, no activation)
    -> fc(class_dim, softmax).

    Args:
        data: variable used as the embedding input.
        label: variable used for cross-entropy and accuracy.
        dict_dim: number of rows of the embedding table.
        emb_dim: width of the embedding table.
        hid_dim: number of convolution filters.
        hid_dim2: size of the hidden fc layer.
        class_dim: size of the softmax output layer.
        win_size: convolution filter size.

    Returns:
        ``(avg_cost, acc, prediction)``.
    """
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], is_sparse=True)

    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max")

    fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)

    prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax")
    avg_cost, acc = _loss_and_accuracy(prediction, label)

    return avg_cost, acc, prediction


def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    """Build an LSTM classifier.

    Layers: embedding -> fc(hid_dim * 4) -> dynamic_lstm -> max sequence
    pool -> tanh -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).

    Args:
        data: variable used as the embedding input.
        label: variable used for cross-entropy and accuracy.
        dict_dim: number of rows of the embedding table.
        emb_dim: width of the embedding table.
        hid_dim: base hidden size; the fc and LSTM layers use 4x this.
        hid_dim2: size of the hidden fc layer after pooling.
        class_dim: size of the softmax output layer.
        emb_lr: ``learning_rate`` of the embedding's ``ParamAttr``.

    Returns:
        ``(avg_cost, acc, prediction)``.
    """
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr),
        is_sparse=True)

    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)

    # Only the hidden-state output is used; the cell state is discarded.
    lstm_h, _ = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)

    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)

    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')

    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')

    avg_cost, acc = _loss_and_accuracy(prediction, label)

    return avg_cost, acc, prediction


def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=400.0):
    """Build a GRU classifier.

    Layers: embedding -> fc(hid_dim * 3) -> dynamic_gru(hid_dim) -> max
    sequence pool -> tanh -> fc(hid_dim2, tanh) -> fc(class_dim, softmax).

    Args:
        data: variable used as the embedding input.
        label: variable used for cross-entropy and accuracy.
        dict_dim: number of rows of the embedding table.
        emb_dim: width of the embedding table.
        hid_dim: GRU size; the preceding fc layer uses 3x this.
        hid_dim2: size of the hidden fc layer after pooling.
        class_dim: size of the softmax output layer.
        emb_lr: ``learning_rate`` of the embedding's ``ParamAttr``.

    Returns:
        ``(avg_cost, acc, prediction)``.
    """
    # NOTE(review): unlike the other nets in this file, this embedding
    # does not pass is_sparse=True - confirm that is intended.
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))

    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)
    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')

    avg_cost, acc = _loss_and_accuracy(prediction, label)

    return avg_cost, acc, prediction

# ---- python/paddle_serving/io/__init__.py ----
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.fluid import CPUPlace
from paddle.fluid import Executor
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.framework import Program
from paddle.fluid.framework import default_main_program
from paddle.fluid.io import save_persistables


def save_model(server_model_folder,
               client_config_folder,
               feed_var_dict,
               fetch_var_dict,
               main_program=None):
    """Save the persistable variables of a program for serving.

    Args:
        server_model_folder: directory passed to ``save_persistables``.
        client_config_folder: accepted but not used by this
            implementation.
        feed_var_dict: accepted but not used by this implementation.
        fetch_var_dict: accepted but not used by this implementation.
        main_program: a ``Program``, a ``CompiledProgram`` (its wrapped
            ``_program`` is used) or ``None`` to use the default main
            program.

    Raises:
        TypeError: if no ``Program`` can be obtained from
            ``main_program``.
    """
    if main_program is None:
        main_program = default_main_program()
    elif isinstance(main_program, CompiledProgram):
        main_program = main_program._program
        if main_program is None:
            raise TypeError("program should be as Program type or None")
    if not isinstance(main_program, Program):
        raise TypeError("program should be as Program type or None")

    # The save always runs on a CPU executor created here.
    executor = Executor(place=CPUPlace())
    save_persistables(executor, server_model_folder, main_program)