Commit a67b25e5 authored by root

change data

Parent: ef70b62a
@@ -97,62 +97,79 @@ Baghdad NNP I-NP I-LOC
```python
main(
    train_data_file="data/train",
    test_data_file="data/test",
    vocab_file="data/vocab.txt",
    target_file="data/target.txt",
    emb_file="data/wordVectors.txt",
    model_save_dir="models",
    num_passes=1000,
    use_gpu=False,
    parallel=True)
```
3. Run the command `python train.py`. **Note: running it directly uses the bundled sample data; replace it with your real labeled data.** (A quick sanity check on the logged metrics is sketched right after the log excerpt below.)
```text
Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
[TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
[TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
```
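The Precision, Recall, and F1 columns above are the chunk-level metrics reported by the chunk evaluator, and F1 is the harmonic mean of precision and recall. A minimal sanity check (illustrative only, using numbers copied from the first `Pass 127` line above):

```python
# Recompute F1 from the logged precision and recall of
# "Pass 127, Batch 9525" above.
precision, recall = 0.3954984, 0.37846154
f1 = 2 * precision * recall / (precision + recall)
print(f1)  # ~0.38679, matching the logged F1_score 0.38679245
```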
### Inference
1. Modify the `infer` function in [infer.py](./infer.py) to specify the path of the model to be tested, the test data, the vocabulary file, and the target label file. The default parameters are as follows:
```python
infer(
    model_path="models/params_pass_0",
    batch_size=6,
    test_data_file="data/test",
    vocab_file="data/vocab.txt",
    target_file="data/target.txt",
    use_gpu=False
)
```
2. Run `python infer.py` in the terminal to start inference; you will see prediction results like the following (partial predictions from a model trained for 70 passes):
```
leicestershire B-ORG B-LOC
extended O O
their O O
first O O
innings O O
by O O
DGDG O O
runs O O
before O O
being O O
bowled O O
out O O
for O O
296 O O
with O O
england B-LOC B-LOC
discard O O
andy B-PER B-PER
caddick I-PER I-PER
taking O O
three O O
for O O
DGDG O O
. O O
```
The output has three columns separated by "\t": the first is the input word, the second is the gold-standard label, and the third is the predicted label. Multiple input sequences are separated by blank lines. (A small parsing sketch follows below.)
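As a convenience, here is a minimal sketch (not part of this repo) for working with that output format; it assumes the predictions have been saved to a hypothetical file `predictions.txt` and computes token-level agreement between the gold and predicted columns.

```python
# Minimal sketch; "predictions.txt" is a hypothetical file holding the
# tab-separated output shown above.
def read_sequences(path):
    """Yield each sequence as a list of (word, gold, pred) tuples."""
    seq = []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:  # a blank line ends one sequence
                if seq:
                    yield seq
                seq = []
                continue
            word, gold, pred = line.split("\t")
            seq.append((word, gold, pred))
    if seq:
        yield seq

total, correct = 0, 0
for seq in read_sequences("predictions.txt"):
    for word, gold, pred in seq:
        total += 1
        correct += int(gold == pred)
print("token-level agreement: %.4f" % (correct / float(total)))
```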
This diff is collapsed.
infer.py:

import gzip
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle

from network_conf import ner_net
import reader
from utils import load_dict, load_reverse_dict, to_lodtensor


def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
          use_gpu):
    word_dict = load_dict(vocab_file)
    word_reverse_dict = load_reverse_dict(vocab_file)
@@ -22,8 +18,7 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
    test_data = paddle.batch(
        reader.data_reader(test_data_file, word_dict, label_dict),
        batch_size=batch_size)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
@@ -31,10 +26,16 @@ def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
    [inference_program, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(model_path, exe)
    for data in test_data():
        word = to_lodtensor(map(lambda x: x[0], data), place)
        mark = to_lodtensor(map(lambda x: x[1], data), place)
        target = to_lodtensor(map(lambda x: x[2], data), place)
        crf_decode = exe.run(
            inference_program,
            feed={"word": word,
                  "mark": mark,
                  "target": target},
            fetch_list=fetch_targets,
            return_numpy=False)
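        # crf_decode[0] holds the decoded label ids of the whole batch in a
        # single LoDTensor; its level-0 LoD records the start offset of each
        # sequence, so sequence i occupies np_data[lod_info[i]:lod_info[i + 1]].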
        lod_info = (crf_decode[0].lod())[0]
        np_data = np.array(crf_decode[0])
        assert len(data) == len(lod_info) - 1
@@ -59,4 +60,5 @@ if __name__ == "__main__":
        batch_size=6,
        test_data_file="data/test",
        vocab_file="data/vocab.txt",
        target_file="data/target.txt",
        use_gpu=False)
train.py:

@@ -7,22 +7,7 @@ import paddle.fluid as fluid
import reader
from network_conf import ner_net
from utils import logger, load_dict, get_embedding, to_lodtensor


def test(exe, chunk_evaluator, inference_program, test_data, place):
@@ -84,7 +69,6 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
        batch_size=BATCH_SIZE)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)
@@ -94,6 +78,8 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
    embedding_param.set(word_vector_values, place)

    print fluid.default_main_program()

    batch_id = 0
    for pass_id in xrange(num_passes):
        chunk_evaluator.reset(exe)
@@ -103,20 +89,21 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
                feed=feeder.feed(data),
                fetch_list=[avg_cost] + chunk_evaluator.metrics)
            if batch_id % 5 == 0:
                print(
                    "Pass " + str(pass_id) + ", Batch " + str(batch_id) +
                    ", Cost " + str(cost[0]) + ", Precision " +
                    str(batch_precision[0]) + ", Recall " + str(batch_recall[0])
                    + ", F1_score" + str(batch_f1_score[0]))
            batch_id = batch_id + 1

        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))

        pass_precision, pass_recall, pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader, place)
        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))

        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
@@ -134,4 +121,4 @@ if __name__ == "__main__":
        model_save_dir="models",
        num_passes=1000,
        use_gpu=False,
        parallel=False)
utils.py:

@@ -2,6 +2,8 @@
# -*- coding: utf-8 -*-
import logging

import paddle.fluid as fluid
import numpy as np

logger = logging.getLogger("paddle")
@@ -42,3 +44,18 @@ def load_reverse_dict(dict_path):
    """
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))


def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
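# Usage sketch (illustrative only, not part of this commit): for a mini-batch
# of id sequences, the level-0 LoD records each sequence's boundaries, e.g.
#   batch = [[1, 2, 3], [4, 5]]
#   t = to_lodtensor(batch, fluid.CPUPlace())
#   # t.lod() == [[0, 3, 5]]; the flattened data has shape (5, 1)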