提交 a67b25e5 编写于 作者: R root

change data

上级 ef70b62a
......@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC
```python
main(
train_data_file="data/train",
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
emb_file="data/wordVectors.txt",
model_save_dir="models/")
train_data_file="data/train",
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
emb_file="data/wordVectors.txt",
model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=True)
```
3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。**
```text
commandline: --use_gpu=False --trainer_count=1
Initing parameters..
Init parameters done.
Pass 0, Batch 0, Cost 41.430110, {'ner_chunk.precision': 0.01587301678955555, 'ner_chunk.F1-score': 0.028368793427944183, 'ner_chunk.recall': 0.13333334028720856, 'error': 0.939393937587738}
Test with Pass 0, Batch 0, {'ner_chunk.precision': 0.0, 'ner_chunk.F1-score': 0.0, 'ner_chunk.recall': 0.0, 'error': 0.16260161995887756}
Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
[TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
[TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
```
### 预测
1. 修改 [infer.py](./infer.py)`main` 函数,指定:需要测试的模型的路径、测试数据、字典文件,预测标记文件的路径,默认参数如下:
1. 修改 [infer.py](./infer.py) 的 `infer` 函数,指定需要测试的模型的路径、测试数据、字典文件、预测标记文件的路径,默认参数如下:
```python
infer(
model_path="models/params_pass_0",
batch_size=2,
batch_size=6,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
target_file="data/target.txt",
use_gpu=False
)
```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果):
```
leicestershire B-ORG B-LOC
extended O O
their O O
first O O
innings O O
by O O
DGDG O O
runs O O
before O O
being O O
bowled O O
out O O
for O O
296 O O
with O O
england B-LOC B-LOC
discard O O
andy B-PER B-PER
caddick I-PER I-PER
taking O O
three O O
for O O
DGDG O O
. O O
leicestershire B-ORG B-LOC
extended O O
their O O
first O O
innings O O
by O O
DGDG O O
runs O O
before O O
being O O
bowled O O
out O O
for O O
296 O O
with O O
england B-LOC B-LOC
discard O O
andy B-PER B-PER
caddick I-PER I-PER
taking O O
three O O
for O O
DGDG O O
. O O
```
输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。
......
因为 它太大了无法显示 source diff 。你可以改为 查看blob
import gzip
import numpy as np
import reader
import paddle.fluid as fluid
import paddle.v2 as paddle
from network_conf import ner_net
from utils import load_dict, load_reverse_dict
from network_conf import ner_net
import reader
from utils import load_dict, load_reverse_dict, to_lodtensor
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
use_gpu):
word_dict = load_dict(vocab_file)
word_reverse_dict = load_reverse_dict(vocab_file)
......@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
test_data = paddle.batch(
reader.data_reader(test_data_file, word_dict, label_dict),
batch_size=batch_size)
place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
......@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
for data in test_data():
crf_decode = exe.run(inference_program,
feed=feeder.feed(data),
fetch_list=fetch_targets,
return_numpy=False)
word = to_lodtensor(map(lambda x: x[0], data), place)
mark = to_lodtensor(map(lambda x: x[1], data), place)
target = to_lodtensor(map(lambda x: x[2], data), place)
crf_decode = exe.run(
inference_program,
feed={"word": word,
"mark": mark,
"target": target},
fetch_list=fetch_targets,
return_numpy=False)
lod_info = (crf_decode[0].lod())[0]
np_data = np.array(crf_decode[0])
assert len(data) == len(lod_info) - 1
......@@ -59,4 +60,5 @@ if __name__ == "__main__":
batch_size=6,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
target_file="data/target.txt",
use_gpu=False)
......@@ -7,22 +7,7 @@ import paddle.fluid as fluid
import reader
from network_conf import ner_net
from utils import logger, load_dict, get_embedding
def to_lodtensor(data, place):
    """Pack a batch of variable-length sequences into one ``fluid.LoDTensor``.

    The sequences are concatenated along axis 0, cast to ``int64`` and
    reshaped into a single column of shape ``(total_length, 1)``. A one-level
    LoD (cumulative offsets ``[0, len(s0), len(s0) + len(s1), ...]``) records
    where each sequence starts and ends inside that column.

    Args:
        data: iterable of sequences (anything supporting ``len`` that
            ``np.concatenate`` accepts). One-shot iterables such as
            generators are supported.
        place: device handle passed straight to ``LoDTensor.set``; callers in
            this project pass ``fluid.CPUPlace()`` or ``fluid.CUDAPlace(0)``.

    Returns:
        The populated ``fluid.LoDTensor``.

    Raises:
        ValueError: propagated from ``np.concatenate`` when ``data`` holds no
            sequences.
    """
    # ``data`` is walked twice (offsets, then concatenation). Materialize it
    # so that an iterator is not drained by the first pass.
    sequences = list(data)

    # Cumulative end offsets; the leading 0 marks the start of the first
    # sequence.
    offsets = [0]
    for seq in sequences:
        offsets.append(offsets[-1] + len(seq))

    flat = np.concatenate(sequences, axis=0).astype("int64")
    flat = flat.reshape([len(flat), 1])

    tensor = fluid.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    return tensor
from utils import logger, load_dict, get_embedding, to_lodtensor
def test(exe, chunk_evaluator, inference_program, test_data, place):
......@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
#place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place)
......@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(word_vector_values, place)
print fluid.default_main_program()
batch_id = 0
for pass_id in xrange(num_passes):
chunk_evaluator.reset(exe)
......@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics)
if batch_id % 5 == 0:
print("Pass " + str(pass_id) + ", Batch " + str(
batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str(
batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
print(
"Pass " + str(pass_id) + ", Batch " + str(batch_id) +
", Cost " + str(cost[0]) + ", Precision " +
str(batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
batch_id = batch_id + 1
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
pass_precision, pass_recall, pass_f1_score = test(
exe, chunk_evaluator, inference_program, test_reader, place)
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
str(pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
......@@ -134,4 +121,4 @@ if __name__ == "__main__":
model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=True)
parallel=False)
......@@ -2,6 +2,8 @@
# -*- coding: utf-8 -*-
import logging
import paddle.fluid as fluid
import numpy as np
logger = logging.getLogger("paddle")
......@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path):
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
def to_lodtensor(data, place):
    """Convert a batch of variable-length sequences into a ``fluid.LoDTensor``.

    All sequences are joined end to end (``np.concatenate`` on axis 0), cast
    to ``int64`` and stored as a ``(total_length, 1)`` column. Sequence
    boundaries are kept as a single LoD level holding the cumulative lengths,
    starting at 0.

    Args:
        data: iterable of sequences to pack. It may be a list or a one-shot
            iterable (generator, Python 3 ``map`` object, ...).
        place: device the tensor data is set on; forwarded unchanged to
            ``LoDTensor.set``.

    Returns:
        A ``fluid.LoDTensor`` carrying the flattened data and its LoD.

    Raises:
        ValueError: raised by ``np.concatenate`` if ``data`` is empty.
    """
    # Materialize once: the batch is consumed twice below (lengths, then
    # concatenation), which would silently exhaust a lazy iterable.
    data = list(data)

    # Build the level-0 LoD as running totals of the sequence lengths.
    cur_len = 0
    lod = [cur_len]
    for seq in data:
        cur_len += len(seq)
        lod.append(cur_len)

    flattened_data = np.concatenate(data, axis=0).astype("int64")
    # One id per row: the layers fed by this tensor are declared with
    # shape=[1].
    flattened_data = flattened_data.reshape([len(flattened_data), 1])

    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册