未验证 提交 319ec2d8 编写于 作者: 1 123malin 提交者: GitHub

Merge branch 'master' into modify_yaml

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -12,28 +12,37 @@ ...@@ -12,28 +12,37 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
train: workspace: "paddlerec.models.contentunderstanding.classification"
trainer:
# for cluster training
strategy: "async"
epochs: 10 dataset:
workspace: "paddlerec.models.contentunderstanding.classification" - name: data1
batch_size: 5
type: DataLoader
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
hyper_parameters:
optimizer:
class: Adagrad
learning_rate: 0.001
is_sparse: False
reader: mode: runner1
batch_size: 5
class: "{workspace}/reader.py"
train_data_path: "{workspace}/train_data"
model: runner:
models: "{workspace}/model.py" - name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save: phase:
increment: - name: phase1
dirname: "increment" model: "{workspace}/model.py"
epoch_interval: 1 dataset_name: data1
save_last: True thread_num: 1
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
...@@ -27,19 +27,27 @@ class Model(ModelBase): ...@@ -27,19 +27,27 @@ class Model(ModelBase):
self.emb_dim = 8 self.emb_dim = 8
self.hid_dim = 128 self.hid_dim = 128
self.class_dim = 2 self.class_dim = 2
self.is_sparse = envs.get_global_env("hyper_parameters.is_sparse",
False)
def train_net(self): def input_data(self, is_infer=False, **kwargs):
""" network definition """
data = fluid.data( data = fluid.data(
name="input", shape=[None, self.max_len], dtype='int64') name="input", shape=[None, self.max_len], dtype='int64')
label = fluid.data(name="label", shape=[None, 1], dtype='int64') label = fluid.data(name="label", shape=[None, 1], dtype='int64')
seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64') seq_len = fluid.data(name="seq_len", shape=[None], dtype='int64')
return [data, label, seq_len]
self._data_var = [data, label, seq_len] def net(self, input, is_infer=False):
""" network definition """
data = input[0]
label = input[1]
seq_len = input[2]
# embedding layer # embedding layer
emb = fluid.embedding(input=data, size=[self.dict_dim, self.emb_dim]) emb = fluid.embedding(
input=data,
size=[self.dict_dim, self.emb_dim],
is_sparse=self.is_sparse)
emb = fluid.layers.sequence_unpad(emb, length=seq_len) emb = fluid.layers.sequence_unpad(emb, length=seq_len)
# convolution layer # convolution layer
conv = fluid.nets.sequence_conv_pool( conv = fluid.nets.sequence_conv_pool(
...@@ -59,19 +67,8 @@ class Model(ModelBase): ...@@ -59,19 +67,8 @@ class Model(ModelBase):
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label) acc = fluid.layers.accuracy(input=prediction, label=label)
self.cost = avg_cost self._cost = avg_cost
self._metrics["acc"] = acc if is_infer:
self._infer_results["acc"] = acc
def get_avg_cost(self): else:
return self.cost self._metrics["acc"] = acc
def get_metrics(self):
return self._metrics
def optimizer(self):
learning_rate = 0.01
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
def infer_net(self):
self.train_net()
...@@ -22,7 +22,7 @@ class TrainReader(Reader): ...@@ -22,7 +22,7 @@ class TrainReader(Reader):
pass pass
def _process_line(self, l): def _process_line(self, l):
l = l.strip().split(" ") l = l.strip().split()
data = l[0:10] data = l[0:10]
seq_len = l[10:11] seq_len = l[10:11]
label = l[11:] label = l[11:]
...@@ -37,8 +37,6 @@ class TrainReader(Reader): ...@@ -37,8 +37,6 @@ class TrainReader(Reader):
data = [int(i) for i in data] data = [int(i) for i in data]
label = [int(i) for i in label] label = [int(i) for i in label]
seq_len = [int(i) for i in seq_len] seq_len = [int(i) for i in seq_len]
print >> sys.stderr, str(
[('data', data), ('label', label), ('seq_len', seq_len)])
yield [('data', data), ('label', label), ('seq_len', seq_len)] yield [('data', data), ('label', label), ('seq_len', seq_len)]
return data_iter return data_iter
...@@ -37,7 +37,18 @@ ...@@ -37,7 +37,18 @@
<img align="center" src="../../doc/imgs/cnn-ckim2014.png"> <img align="center" src="../../doc/imgs/cnn-ckim2014.png">
<p> <p>
## 使用教程 ## 使用教程(快速开始)
```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.tagspace
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification
```
## 使用教程(复现论文)
### 注意
为了方便使用者能够快速地跑通每一个模型,我们在每个模型下都提供了样例数据。如果需要复现 README 中的效果,请使用以下提供的脚本下载对应数据集并进行数据预处理。
### 数据处理 ### 数据处理
**(1)TagSpace** **(1)TagSpace**
...@@ -64,20 +75,42 @@ mv test.csv raw_big_test_data ...@@ -64,20 +75,42 @@ mv test.csv raw_big_test_data
python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt python text2paddle.py raw_big_train_data/ raw_big_test_data/ train_big_data test_big_data big_vocab_text.txt big_vocab_tag.txt
``` ```
**(2)Classification** ### 训练
```
cd models/contentunderstanding/tagspace
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
```
### 预测
```
# 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: single_infer
# 修改phase阶段为infer的配置,参照config注释
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml
```
### 训练 **(2)Classification**
### 训练
``` ```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification cd models/contentunderstanding/classification
python -m paddlerec.run -m ./config.yaml # 自定义修改超参后,指定配置文件,使用自定义配置
``` ```
### 预测 ### 预测
``` ```
python -m paddlerec.run -m paddlerec.models.contentunderstanding.classification # 修改对应模型的config.yaml, workspace配置为当前目录的绝对路径
# 修改对应模型的config.yaml,mode配置infer_runner
# 示例: mode: train_runner -> mode: infer_runner
# infer_runner中 class配置为 class: single_infer
# 修改phase阶段为infer的配置,参照config注释
# 修改完config.yaml后 执行:
python -m paddlerec.run -m ./config.yaml
``` ```
## 效果对比 ## 效果对比
......
...@@ -12,38 +12,44 @@ ...@@ -12,38 +12,44 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
train: workspace: "paddlerec.models.contentunderstanding.tagspace"
trainer:
# for cluster training
strategy: "async"
epochs: 10 dataset:
workspace: "paddlerec.models.contentunderstanding.tagspace" - name: sample_1
type: QueueDataset
batch_size: 5
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
reader: hyper_parameters:
batch_size: 5 optimizer:
class: "{workspace}/reader.py" class: Adagrad
train_data_path: "{workspace}/train_data" learning_rate: 0.001
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
model: mode: runner1
models: "{workspace}/model.py"
hyper_parameters:
vocab_text_size: 11447
vocab_tag_size: 4
emb_dim: 10
hid_dim: 1000
win_size: 5
margin: 0.1
neg_size: 3
num_devices: 1
runner:
- name: runner1
class: single_train
epochs: 10
device: cpu
save_checkpoint_interval: 2
save_inference_interval: 4
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
save: phase:
increment: - name: phase1
dirname: "increment" model: "{workspace}/model.py"
epoch_interval: 1 dataset_name: sample_1
save_last: True thread_num: 1
inference:
dirname: "inference"
epoch_interval: 100
save_last: True
...@@ -26,26 +26,30 @@ class Model(ModelBase): ...@@ -26,26 +26,30 @@ class Model(ModelBase):
ModelBase.__init__(self, config) ModelBase.__init__(self, config)
self.cost = None self.cost = None
self.metrics = {} self.metrics = {}
self.vocab_text_size = envs.get_global_env("vocab_text_size", None, self.vocab_text_size = envs.get_global_env(
self._namespace) "hyper_parameters.vocab_text_size")
self.vocab_tag_size = envs.get_global_env("vocab_tag_size", None, self.vocab_tag_size = envs.get_global_env(
self._namespace) "hyper_parameters.vocab_tag_size")
self.emb_dim = envs.get_global_env("emb_dim", None, self._namespace) self.emb_dim = envs.get_global_env("hyper_parameters.emb_dim")
self.hid_dim = envs.get_global_env("hid_dim", None, self._namespace) self.hid_dim = envs.get_global_env("hyper_parameters.hid_dim")
self.win_size = envs.get_global_env("win_size", None, self._namespace) self.win_size = envs.get_global_env("hyper_parameters.win_size")
self.margin = envs.get_global_env("margin", None, self._namespace) self.margin = envs.get_global_env("hyper_parameters.margin")
self.neg_size = envs.get_global_env("neg_size", None, self._namespace) self.neg_size = envs.get_global_env("hyper_parameters.neg_size")
def train_net(self): def input_data(self, is_infer=False, **kwargs):
""" network"""
text = fluid.data( text = fluid.data(
name="text", shape=[None, 1], lod_level=1, dtype='int64') name="text", shape=[None, 1], lod_level=1, dtype='int64')
pos_tag = fluid.data( pos_tag = fluid.data(
name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64') name="pos_tag", shape=[None, 1], lod_level=1, dtype='int64')
neg_tag = fluid.data( neg_tag = fluid.data(
name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64') name="neg_tag", shape=[None, 1], lod_level=1, dtype='int64')
return [text, pos_tag, neg_tag]
self._data_var = [text, pos_tag, neg_tag] def net(self, input, is_infer=False):
""" network"""
text = input[0]
pos_tag = input[1]
neg_tag = input[2]
text_emb = fluid.embedding( text_emb = fluid.embedding(
input=text, input=text,
...@@ -97,22 +101,11 @@ class Model(ModelBase): ...@@ -97,22 +101,11 @@ class Model(ModelBase):
avg_cost = nn.mean(loss_part3) avg_cost = nn.mean(loss_part3)
less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32') less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
correct = nn.reduce_sum(less) correct = nn.reduce_sum(less)
self.cost = avg_cost self._cost = avg_cost
self.metrics["correct"] = correct
self.metrics["cos_pos"] = cos_pos
def get_avg_cost(self):
return self.cost
def get_metrics(self):
return self.metrics
def optimizer(self):
learning_rate = envs.get_global_env("hyper_parameters.base_lr", None,
self._namespace)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=learning_rate)
return sgd_optimizer
def infer_net(self, parameter_list): if is_infer:
self.train_net() self._infer_results["correct"] = correct
self._infer_results["cos_pos"] = cos_pos
else:
self._metrics["correct"] = correct
self._metrics["cos_pos"] = cos_pos
...@@ -63,7 +63,7 @@ def build(dirname): ...@@ -63,7 +63,7 @@ def build(dirname):
models_copy = [ models_copy = [
'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy', 'data/*.txt', 'data/*/*.txt', '*.yaml', '*.sh', 'tree/*.npy',
'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*', 'tree/*.txt', 'data/sample_data/*', 'data/sample_data/train/*',
'data/sample_data/infer/*' 'data/sample_data/infer/*', 'data/*/*.csv'
] ]
engine_copy = ['*/*.sh'] engine_copy = ['*/*.sh']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册