提交 a67b25e5 编写于 作者: R root

change data

上级 ef70b62a
...@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC ...@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC
```python ```python
main( main(
train_data_file="data/train", train_data_file="data/train",
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt", target_file="data/target.txt",
emb_file="data/wordVectors.txt", emb_file="data/wordVectors.txt",
model_save_dir="models/") model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=True)
``` ```
3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。** 3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。**
```text ```text
commandline: --use_gpu=False --trainer_count=1 Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
Initing parameters.. Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
Init parameters done. Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
Pass 0, Batch 0, Cost 41.430110, {'ner_chunk.precision': 0.01587301678955555, 'ner_chunk.F1-score': 0.028368793427944183, 'ner_chunk.recall': 0.13333334028720856, 'error': 0.939393937587738} Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
Test with Pass 0, Batch 0, {'ner_chunk.precision': 0.0, 'ner_chunk.F1-score': 0.0, 'ner_chunk.recall': 0.0, 'error': 0.16260161995887756} Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
[TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
[TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
``` ```
### 预测 ### 预测
1. 修改 [infer.py](./infer.py) 的 `main` 函数,指定需要测试的模型的路径、测试数据、字典文件、预测标记文件的路径,默认参数如下: 1. 修改 [infer.py](./infer.py) 的 `infer` 函数,指定需要测试的模型的路径、测试数据、字典文件、预测标记文件的路径,默认参数如下:
```python ```python
infer( infer(
model_path="models/params_pass_0", model_path="models/params_pass_0",
batch_size=2, batch_size=6,
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt") target_file="data/target.txt",
use_gpu=False
)
``` ```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果): 2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果):
``` ```
leicestershire B-ORG B-LOC leicestershire B-ORG B-LOC
extended O O extended O O
their O O their O O
first O O first O O
innings O O innings O O
by O O by O O
DGDG O O DGDG O O
runs O O runs O O
before O O before O O
being O O being O O
bowled O O bowled O O
out O O out O O
for O O for O O
296 O O 296 O O
with O O with O O
england B-LOC B-LOC england B-LOC B-LOC
discard O O discard O O
andy B-PER B-PER andy B-PER B-PER
caddick I-PER I-PER caddick I-PER I-PER
taking O O taking O O
three O O three O O
for O O for O O
DGDG O O DGDG O O
. O O . O O
``` ```
输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。 输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。
......
因为 它太大了无法显示 source diff 。你可以改为 查看blob
import gzip
import numpy as np import numpy as np
import reader
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
from network_conf import ner_net
from utils import load_dict, load_reverse_dict
from network_conf import ner_net
import reader
from utils import load_dict, load_reverse_dict, to_lodtensor
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
use_gpu):
word_dict = load_dict(vocab_file) word_dict = load_dict(vocab_file)
word_reverse_dict = load_reverse_dict(vocab_file) word_reverse_dict = load_reverse_dict(vocab_file)
...@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file): ...@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
test_data = paddle.batch( test_data = paddle.batch(
reader.data_reader(test_data_file, word_dict, label_dict), reader.data_reader(test_data_file, word_dict, label_dict),
batch_size=batch_size) batch_size=batch_size)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
inference_scope = fluid.core.Scope() inference_scope = fluid.core.Scope()
...@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file): ...@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(model_path, exe) fetch_targets] = fluid.io.load_inference_model(model_path, exe)
for data in test_data(): for data in test_data():
crf_decode = exe.run(inference_program, word = to_lodtensor(map(lambda x: x[0], data), place)
feed=feeder.feed(data), mark = to_lodtensor(map(lambda x: x[1], data), place)
fetch_list=fetch_targets, target = to_lodtensor(map(lambda x: x[2], data), place)
return_numpy=False) crf_decode = exe.run(
inference_program,
feed={"word": word,
"mark": mark,
"target": target},
fetch_list=fetch_targets,
return_numpy=False)
lod_info = (crf_decode[0].lod())[0] lod_info = (crf_decode[0].lod())[0]
np_data = np.array(crf_decode[0]) np_data = np.array(crf_decode[0])
assert len(data) == len(lod_info) - 1 assert len(data) == len(lod_info) - 1
...@@ -59,4 +60,5 @@ if __name__ == "__main__": ...@@ -59,4 +60,5 @@ if __name__ == "__main__":
batch_size=6, batch_size=6,
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt") target_file="data/target.txt",
use_gpu=False)
...@@ -7,22 +7,7 @@ import paddle.fluid as fluid ...@@ -7,22 +7,7 @@ import paddle.fluid as fluid
import reader import reader
from network_conf import ner_net from network_conf import ner_net
from utils import logger, load_dict, get_embedding from utils import logger, load_dict, get_embedding, to_lodtensor
# Pack a batch of variable-length sequences into one fluid.LoDTensor.
# NOTE(review): this is the copy shown on the removed side of the diff; the
# import line just above now pulls to_lodtensor from utils. Indentation was
# lost in this rendering, so the lines are left exactly as displayed.
def to_lodtensor(data, place):
# Length of every sequence in the batch.
seq_lens = [len(seq) for seq in data]
# Build cumulative offsets starting at 0: [0, len0, len0+len1, ...].
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
# Join all sequences along axis 0 and cast the result to int64.
flattened_data = np.concatenate(data, axis=0).astype("int64")
# Reshape into a single column: [total_length, 1].
flattened_data = flattened_data.reshape([len(flattened_data), 1])
# Copy the flattened array onto `place` and attach the offsets as one LoD level.
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def test(exe, chunk_evaluator, inference_program, test_data, place): def test(exe, chunk_evaluator, inference_program, test_data, place):
...@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
#place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place) feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(word_vector_values, place) embedding_param.set(word_vector_values, place)
print fluid.default_main_program()
batch_id = 0 batch_id = 0
for pass_id in xrange(num_passes): for pass_id in xrange(num_passes):
chunk_evaluator.reset(exe) chunk_evaluator.reset(exe)
...@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics) fetch_list=[avg_cost] + chunk_evaluator.metrics)
if batch_id % 5 == 0: if batch_id % 5 == 0:
print("Pass " + str(pass_id) + ", Batch " + str( print(
batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str( "Pass " + str(pass_id) + ", Batch " + str(batch_id) +
batch_precision[0]) + ", Recall " + str(batch_recall[0]) ", Cost " + str(cost[0]) + ", Precision " +
+ ", F1_score" + str(batch_f1_score[0])) str(batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
batch_id = batch_id + 1 batch_id = batch_id + 1
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe) pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str( print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
pass_precision) + " pass_recall:" + str(pass_recall) + str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score)) " pass_f1_score:" + str(pass_f1_score))
pass_precision, pass_recall, pass_f1_score = test( pass_precision, pass_recall, pass_f1_score = test(
exe, chunk_evaluator, inference_program, test_reader, place) exe, chunk_evaluator, inference_program, test_reader, place)
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str( print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
pass_precision) + " pass_recall:" + str(pass_recall) + str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score)) " pass_f1_score:" + str(pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id) save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
...@@ -134,4 +121,4 @@ if __name__ == "__main__": ...@@ -134,4 +121,4 @@ if __name__ == "__main__":
model_save_dir="models", model_save_dir="models",
num_passes=1000, num_passes=1000,
use_gpu=False, use_gpu=False,
parallel=True) parallel=False)
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging import logging
import paddle.fluid as fluid
import numpy as np import numpy as np
logger = logging.getLogger("paddle") logger = logging.getLogger("paddle")
...@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path): ...@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path):
""" """
return dict((idx, line.strip().split("\t")[0]) return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines())) for idx, line in enumerate(open(dict_path, "r").readlines()))
def to_lodtensor(data, place):
    """Pack a batch of variable-length sequences into one fluid.LoDTensor.

    Args:
        data: iterable of sequences; each sequence must support ``len`` and
            be accepted by ``np.concatenate``. Any iterable is allowed,
            including one-shot iterators (a generator, or the object
            returned by ``map`` on Python 3).
        place: device place forwarded unchanged to ``LoDTensor.set``.

    Returns:
        A ``fluid.LoDTensor`` whose data is every element of every sequence,
        cast to int64 and reshaped to ``[total_length, 1]``, carrying a
        single LoD level of cumulative offsets ``[0, len0, len0+len1, ...]``.

    Raises:
        ValueError: propagated from ``np.concatenate`` when ``data`` holds
            no sequences.
    """
    # Materialize once: the batch is walked twice below (offsets, then
    # concatenation), and a one-shot iterator would already be exhausted
    # by the time np.concatenate sees it.
    sequences = list(data)

    # Cumulative start/end offsets of each sequence in the flattened data.
    offsets = [0]
    for seq in sequences:
        offsets.append(offsets[-1] + len(seq))

    flattened_data = np.concatenate(sequences, axis=0).astype("int64")
    # The tensor is fed as a single column, one element per row.
    flattened_data = flattened_data.reshape([len(flattened_data), 1])

    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([offsets])
    return res
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册