diff --git a/models/contentunderstanding/classification/__init__.py b/models/contentunderstanding/classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/models/contentunderstanding/classification/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/models/contentunderstanding/classification/config.yaml b/models/contentunderstanding/classification/config.yaml index ef55cd18e8fd45829acd2f479c661f27decfda71..9e0bdd1e851ada704eb2377efe0a82154fd2b371 100644 --- a/models/contentunderstanding/classification/config.yaml +++ b/models/contentunderstanding/classification/config.yaml @@ -12,28 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-train: - trainer: - # for cluster training - strategy: "async" +workspace: "paddlerec.models.contentunderstanding.classification" - epochs: 10 - workspace: "paddlerec.models.contentunderstanding.classification" +dataset: +- name: data1 + batch_size: 5 + type: DataLoader + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" + +hyper_parameters: + optimizer: + class: Adagrad + learning_rate: 0.001 + is_sparse: False - reader: - batch_size: 5 - class: "{workspace}/reader.py" - train_data_path: "{workspace}/train_data" +mode: runner1 - model: - models: "{workspace}/model.py" +runner: +- name: runner1 + class: single_train + epochs: 10 + device: cpu + save_checkpoint_interval: 2 + save_inference_interval: 4 + save_checkpoint_path: "increment" + save_inference_path: "inference" + save_inference_feed_varnames: [] + save_inference_fetch_varnames: [] - save: - increment: - dirname: "increment" - epoch_interval: 1 - save_last: True - inference: - dirname: "inference" - epoch_interval: 100 - save_last: True +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: data1 + thread_num: 1 diff --git a/models/contentunderstanding/classification/train_data/part-0 b/models/contentunderstanding/classification/data/train_data/part-0.txt similarity index 100% rename from models/contentunderstanding/classification/train_data/part-0 rename to models/contentunderstanding/classification/data/train_data/part-0.txt diff --git a/models/contentunderstanding/classification/model.py b/models/contentunderstanding/classification/model.py index 23c51d44d7d839d9db30f8129c3e42449a6a80d4..ce9caf5bfa6c5bc229a52d09c5c8f3b6093b80c6 100644 --- a/models/contentunderstanding/classification/model.py +++ b/models/contentunderstanding/classification/model.py @@ -27,19 +27,27 @@ class Model(ModelBase): self.emb_dim = 8 self.hid_dim = 128 self.class_dim = 2 + self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse", + False) - def train_net(self): - """ 
network definition """ - + def input_data(self, is_infer=False, **kwargs): data = fluid.data( name="input", shape=[None, self.max_len], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') + return [data, label, seq_len] - self._data_var = [data, label, seq_len] + def net(self, input, is_infer=False): + """ network definition """ + data = input[0] + label = input[1] + seq_len = input[2] # embedding layer - emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim]) + emb = fluid.embedding( + input=data, + size=[self.dict_dim, self.emb_dim], + is_sparse=self.is_sparse) emb = fluid.layers.sequence_unpad(emb, length=seq_len) # convolution layer conv = fluid.nets.sequence_conv_pool( @@ -59,19 +67,8 @@ class Model(ModelBase): avg_cost = fluid.layers.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) - self.cost = avg_cost - self._metrics["acc"] = acc - - def get_avg_cost(self): - return self.cost - - def get_metrics(self): - return self._metrics - - def optimizer(self): - learning_rate = 0.01 - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) - return sgd_optimizer - - def infer_net(self): - self.train_net() + self._cost = avg_cost + if is_infer: + self._infer_results["acc"] = acc + else: + self._metrics["acc"] = acc diff --git a/models/contentunderstanding/classification/reader.py b/models/contentunderstanding/classification/reader.py index 1c8e86cdb49f1cc89c9c4f413cbd7b117b55aa55..18a41ee844c35d0a6fa37a835203121868158c4e 100644 --- a/models/contentunderstanding/classification/reader.py +++ b/models/contentunderstanding/classification/reader.py @@ -22,7 +22,7 @@ class TrainReader(Reader): pass def _process_line(self, l): - l = l.strip().split(" ") + l = l.strip().split() data = l[0:10] seq_len = l[10:11] label = l[11:] @@ -37,8 +37,6 @@ class TrainReader(Reader): data = [int(i) for i in data] label = [int(i) for i in label] 
seq_len = [int(i) for i in seq_len] - print >> sys.stderr, str( - [('data', data), ('label', label), ('seq_len', seq_len)]) yield [('data', data), ('label', label), ('seq_len', seq_len)] return data_iter diff --git a/models/contentunderstanding/readme.md b/models/contentunderstanding/readme.md index deefbd2eb02f08d7fac810eb40ae78ff1a173baf..217d7124d7cdb481ca7aacb418e36148508e42b8 100644 --- a/models/contentunderstanding/readme.md +++ b/models/contentunderstanding/readme.md @@ -37,7 +37,18 @@

-## 使用教程 +## 使用教程(快速开始) +``` +python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace +python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification +``` + +## 使用教程(复现论文) + +### 注意 + +为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现readme中的效果请使用以下提供的脚本下载对应数据集以及数据预处理。 + ### 数据处理 **(1)TagSpace** @@ -64,20 +75,42 @@ mv test.csv raw_big_test_data python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt ``` -**(2)Classification** +### 训练 +``` +cd models/contentunderstanding/tagspace +python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置 +``` -无 +### 预测 +``` +# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径 +# 修改对应模型的config.yaml,mode配置infer_runner +# 示例: mode: train_runner -> mode: infer_runner +# infer_runner中 class配置为 class: single_infer +# 修改phase阶段为infer的配置,参照config注释 + +# 修改完config.yaml后 执行: +python -m paddlerec.run -m ./config.yaml +``` -### 训练 +**(2)Classification** +### 训练 ``` -python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification +cd models/contentunderstanding/classification +python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置 ``` ### 预测 - ``` -python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification +# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径 +# 修改对应模型的config.yaml,mode配置infer_runner +# 示例: mode: train_runner -> mode: infer_runner +# infer_runner中 class配置为 class: single_infer +# 修改phase阶段为infer的配置,参照config注释 + +# 修改完config.yaml后 执行: +python -m paddlerec.run -m ./config.yaml ``` ## 效果对比 diff --git a/models/contentunderstanding/tagspace/config.yaml b/models/contentunderstanding/tagspace/config.yaml index 19fbf277d66445c44287856512cb0b13777dc251..8ca28f2977dd4bfd382e250e5c6513b156360404 100644 --- a/models/contentunderstanding/tagspace/config.yaml +++ b/models/contentunderstanding/tagspace/config.yaml @@ -12,38 +12,44 @@ # See the License for the specific language governing permissions and #
limitations under the License. -train: - trainer: - # for cluster training - strategy: "async" +workspace: "paddlerec.models.contentunderstanding.tagspace" - epochs: 10 - workspace: "paddlerec.models.contentunderstanding.tagspace" +dataset: +- name: sample_1 + type: QueueDataset + batch_size: 5 + data_path: "{workspace}/data/train_data" + data_converter: "{workspace}/reader.py" - reader: - batch_size: 5 - class: "{workspace}/reader.py" - train_data_path: "{workspace}/train_data" +hyper_parameters: + optimizer: + class: Adagrad + learning_rate: 0.001 + vocab_text_size: 11447 + vocab_tag_size: 4 + emb_dim: 10 + hid_dim: 1000 + win_size: 5 + margin: 0.1 + neg_size: 3 + num_devices: 1 - model: - models: "{workspace}/model.py" - hyper_parameters: - vocab_text_size: 11447 - vocab_tag_size: 4 - emb_dim: 10 - hid_dim: 1000 - win_size: 5 - margin: 0.1 - neg_size: 3 - num_devices: 1 +mode: runner1 +runner: +- name: runner1 + class: single_train + epochs: 10 + device: cpu + save_checkpoint_interval: 2 + save_inference_interval: 4 + save_checkpoint_path: "increment" + save_inference_path: "inference" + save_inference_feed_varnames: [] + save_inference_fetch_varnames: [] - save: - increment: - dirname: "increment" - epoch_interval: 1 - save_last: True - inference: - dirname: "inference" - epoch_interval: 100 - save_last: True +phase: +- name: phase1 + model: "{workspace}/model.py" + dataset_name: sample_1 + thread_num: 1 diff --git a/models/contentunderstanding/tagspace/test_data/small_test.csv b/models/contentunderstanding/tagspace/data/test_data/small_test.csv similarity index 100% rename from models/contentunderstanding/tagspace/test_data/small_test.csv rename to models/contentunderstanding/tagspace/data/test_data/small_test.csv diff --git a/models/contentunderstanding/tagspace/train_data/small_train.csv b/models/contentunderstanding/tagspace/data/train_data/small_train.csv similarity index 100% rename from models/contentunderstanding/tagspace/train_data/small_train.csv 
rename to models/contentunderstanding/tagspace/data/train_data/small_train.csv diff --git a/models/contentunderstanding/tagspace/model.py b/models/contentunderstanding/tagspace/model.py index 2948d2e3d5f4a5d5afdbb9744f235b5db59e6bae..34e5ebace1c3f4a44def5ad8006c2eb74a40c3c0 100644 --- a/models/contentunderstanding/tagspace/model.py +++ b/models/contentunderstanding/tagspace/model.py @@ -26,26 +26,30 @@ class Model(ModelBase): ModelBase.__init__(self, config) self.cost = None self.metrics = {} - self.vocab_text_size = envs.get_global_env("vocab_text_size", None, - self._namespace) - self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, - self._namespace) - self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) - self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) - self.win_size = envs.get_global_env("win_size", None, self._namespace) - self.margin = envs.get_global_env("margin", None, self._namespace) - self.neg_size = envs.get_global_env("neg_size", None, self._namespace) + self.vocab_text_size = envs.get_global_env( + "hyper_parameters.vocab_text_size") + self.vocab_tag_size = envs.get_global_env( + "hyper_parameters.vocab_tag_size") + self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim") + self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim") + self.win_size = envs.get_global_env("hyper_parameters.win_size") + self.margin = envs.get_global_env("hyper_parameters.margin") + self.neg_size = envs.get_global_env("hyper_parameters.neg_size") - def train_net(self): - """ network""" + def input_data(self, is_infer=False, **kwargs): text = fluid.data( name="text", shape=[None, 1], lod_level=1, dtype='int64') pos_tag = fluid.data( name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') neg_tag = fluid.data( name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64') + return [text, pos_tag, neg_tag] - self._data_var = [text, pos_tag, neg_tag] + def net(self, input, is_infer=False): + """ network""" + text 
= input[0] + pos_tag = input[1] + neg_tag = input[2] text_emb = fluid.embedding( input=text, @@ -97,22 +101,11 @@ class Model(ModelBase): avg_cost = nn.mean(loss_part3) less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32') correct = nn.reduce_sum(less) - self.cost = avg_cost - - self.metrics["correct"] = correct - self.metrics["cos_pos"] = cos_pos - - def get_avg_cost(self): - return self.cost - - def get_metrics(self): - return self.metrics - - def optimizer(self): - learning_rate = envs.get_global_env("hyper_parameters.base_lr", None, - self._namespace) - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate) - return sgd_optimizer + self._cost = avg_cost - def infer_net(self, parameter_list): - self.train_net() + if is_infer: + self._infer_results["correct"] = correct + self._infer_results["cos_pos"] = cos_pos + else: + self._metrics["correct"] = correct + self._metrics["cos_pos"] = cos_pos diff --git a/setup.py b/setup.py index d523f40904db2963ece889035616843d95517d05..8ad1cc742434aa39513a1c618b56649c3530686a 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def build(dirname): models_copy = [ 'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*', - 'data/sample_data/infer/*' + 'data/sample_data/infer/*', 'data/*/*.csv' ] engine_copy = ['*/*.sh']