提交 a67b25e5 编写于 作者: R root

change data

上级 ef70b62a
...@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC ...@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC
```python ```python
main( main(
train_data_file="data/train", train_data_file="data/train",
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt", target_file="data/target.txt",
emb_file="data/wordVectors.txt", emb_file="data/wordVectors.txt",
model_save_dir="models/") model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=True)
``` ```
3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。** 3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。**
```text ```text
commandline: --use_gpu=False --trainer_count=1 Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
Initing parameters.. Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
Init parameters done. Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
Pass 0, Batch 0, Cost 41.430110, {'ner_chunk.precision': 0.01587301678955555, 'ner_chunk.F1-score': 0.028368793427944183, 'ner_chunk.recall': 0.13333334028720856, 'error': 0.939393937587738} Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
Test with Pass 0, Batch 0, {'ner_chunk.precision': 0.0, 'ner_chunk.F1-score': 0.0, 'ner_chunk.recall': 0.0, 'error': 0.16260161995887756} Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
[TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
[TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
``` ```
### 预测 ### 预测
1. 修改 [infer.py](./infer.py) 的 `main` 函数,指定:需要测试的模型的路径、测试数据、字典文件、预测标记文件的路径,默认参数如下: 1. 修改 [infer.py](./infer.py) 的 `infer` 函数,指定:需要测试的模型的路径、测试数据、字典文件、预测标记文件的路径,默认参数如下:
```python ```python
infer( infer(
model_path="models/params_pass_0", model_path="models/params_pass_0",
batch_size=2, batch_size=6,
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt") target_file="data/target.txt",
use_gpu=False
)
``` ```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果): 2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果):
``` ```
leicestershire B-ORG B-LOC leicestershire B-ORG B-LOC
extended O O extended O O
their O O their O O
first O O first O O
innings O O innings O O
by O O by O O
DGDG O O DGDG O O
runs O O runs O O
before O O before O O
being O O being O O
bowled O O bowled O O
out O O out O O
for O O for O O
296 O O 296 O O
with O O with O O
england B-LOC B-LOC england B-LOC B-LOC
discard O O discard O O
andy B-PER B-PER andy B-PER B-PER
caddick I-PER I-PER caddick I-PER I-PER
taking O O taking O O
three O O three O O
for O O for O O
DGDG O O DGDG O O
. O O . O O
``` ```
输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。 输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。
......
此差异已折叠。
import gzip
import numpy as np import numpy as np
import reader
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.v2 as paddle import paddle.v2 as paddle
from network_conf import ner_net
from utils import load_dict, load_reverse_dict
from network_conf import ner_net
import reader
from utils import load_dict, load_reverse_dict, to_lodtensor
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
use_gpu):
word_dict = load_dict(vocab_file) word_dict = load_dict(vocab_file)
word_reverse_dict = load_reverse_dict(vocab_file) word_reverse_dict = load_reverse_dict(vocab_file)
...@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file): ...@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
test_data = paddle.batch( test_data = paddle.batch(
reader.data_reader(test_data_file, word_dict, label_dict), reader.data_reader(test_data_file, word_dict, label_dict),
batch_size=batch_size) batch_size=batch_size)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
inference_scope = fluid.core.Scope() inference_scope = fluid.core.Scope()
...@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file): ...@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(model_path, exe) fetch_targets] = fluid.io.load_inference_model(model_path, exe)
for data in test_data(): for data in test_data():
crf_decode = exe.run(inference_program, word = to_lodtensor(map(lambda x: x[0], data), place)
feed=feeder.feed(data), mark = to_lodtensor(map(lambda x: x[1], data), place)
fetch_list=fetch_targets, target = to_lodtensor(map(lambda x: x[2], data), place)
return_numpy=False) crf_decode = exe.run(
inference_program,
feed={"word": word,
"mark": mark,
"target": target},
fetch_list=fetch_targets,
return_numpy=False)
lod_info = (crf_decode[0].lod())[0] lod_info = (crf_decode[0].lod())[0]
np_data = np.array(crf_decode[0]) np_data = np.array(crf_decode[0])
assert len(data) == len(lod_info) - 1 assert len(data) == len(lod_info) - 1
...@@ -59,4 +60,5 @@ if __name__ == "__main__": ...@@ -59,4 +60,5 @@ if __name__ == "__main__":
batch_size=6, batch_size=6,
test_data_file="data/test", test_data_file="data/test",
vocab_file="data/vocab.txt", vocab_file="data/vocab.txt",
target_file="data/target.txt") target_file="data/target.txt",
use_gpu=False)
...@@ -7,22 +7,7 @@ import paddle.fluid as fluid ...@@ -7,22 +7,7 @@ import paddle.fluid as fluid
import reader import reader
from network_conf import ner_net from network_conf import ner_net
from utils import logger, load_dict, get_embedding from utils import logger, load_dict, get_embedding, to_lodtensor
def to_lodtensor(data, place):
    """Flatten a batch of sequences into a single ``fluid.LoDTensor``.

    Args:
        data: batch of sequences; each element must support ``len()`` and be
            concatenable by ``np.concatenate``.
        place: device place handed to ``LoDTensor.set``.

    Returns:
        A ``fluid.LoDTensor`` holding every element as an ``int64`` column of
        shape ``(total_length, 1)``, with one LoD level made of the cumulative
        sequence offsets ``[0, len0, len0 + len1, ...]``.
    """
    # Running offsets: entry i is where sequence i starts in the flat tensor,
    # and the last entry is the total number of elements.
    offsets = [0]
    for length in [len(seq) for seq in data]:
        offsets.append(offsets[-1] + length)

    packed = np.concatenate(data, axis=0).astype("int64")
    packed = packed.reshape([len(packed), 1])

    tensor = fluid.LoDTensor()
    tensor.set(packed, place)
    tensor.set_lod([offsets])
    return tensor
def test(exe, chunk_evaluator, inference_program, test_data, place): def test(exe, chunk_evaluator, inference_program, test_data, place):
...@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
#place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place) feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor() embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(word_vector_values, place) embedding_param.set(word_vector_values, place)
print fluid.default_main_program()
batch_id = 0 batch_id = 0
for pass_id in xrange(num_passes): for pass_id in xrange(num_passes):
chunk_evaluator.reset(exe) chunk_evaluator.reset(exe)
...@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file, ...@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics) fetch_list=[avg_cost] + chunk_evaluator.metrics)
if batch_id % 5 == 0: if batch_id % 5 == 0:
print("Pass " + str(pass_id) + ", Batch " + str( print(
batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str( "Pass " + str(pass_id) + ", Batch " + str(batch_id) +
batch_precision[0]) + ", Recall " + str(batch_recall[0]) ", Cost " + str(cost[0]) + ", Precision " +
+ ", F1_score" + str(batch_f1_score[0])) str(batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
batch_id = batch_id + 1 batch_id = batch_id + 1
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe) pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str( print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
pass_precision) + " pass_recall:" + str(pass_recall) + str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score)) " pass_f1_score:" + str(pass_f1_score))
pass_precision, pass_recall, pass_f1_score = test( pass_precision, pass_recall, pass_f1_score = test(
exe, chunk_evaluator, inference_program, test_reader, place) exe, chunk_evaluator, inference_program, test_reader, place)
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str( print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
pass_precision) + " pass_recall:" + str(pass_recall) + str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score)) " pass_f1_score:" + str(pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id) save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
...@@ -134,4 +121,4 @@ if __name__ == "__main__": ...@@ -134,4 +121,4 @@ if __name__ == "__main__":
model_save_dir="models", model_save_dir="models",
num_passes=1000, num_passes=1000,
use_gpu=False, use_gpu=False,
parallel=True) parallel=False)
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import logging import logging
import paddle.fluid as fluid
import numpy as np import numpy as np
logger = logging.getLogger("paddle") logger = logging.getLogger("paddle")
...@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path): ...@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path):
""" """
return dict((idx, line.strip().split("\t")[0]) return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines())) for idx, line in enumerate(open(dict_path, "r").readlines()))
def to_lodtensor(data, place):
    """Pack a batch of variable-length sequences into one ``fluid.LoDTensor``.

    Args:
        data: iterable of sequences (each supporting ``len()``). May be a
            one-shot iterator such as the result of ``map(...)``; it is
            materialized into a list before use.
        place: device place forwarded to ``LoDTensor.set``.

    Returns:
        A ``fluid.LoDTensor`` whose data is all sequences concatenated into an
        ``int64`` array of shape ``(total_length, 1)`` and whose single LoD
        level is the list of cumulative offsets
        ``[0, len0, len0 + len1, ...]``.

    Raises:
        ValueError: propagated from ``np.concatenate`` when ``data`` is empty.
    """
    # `data` is walked twice below (lengths, then concatenation). Callers pass
    # `map(...)`, which is a single-use iterator on interpreters where `map`
    # is lazy, so take a concrete list up front.
    data = list(data)

    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for seq_len in seq_lens:
        cur_len += seq_len
        lod.append(cur_len)

    flattened_data = np.concatenate(data, axis=0).astype("int64")
    # One element per row: reshape to a column of shape (total_length, 1).
    flattened_data = flattened_data.reshape([len(flattened_data), 1])

    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册