Merge pull request #1121 from kuke/sequence_tagging_for_ner_ce

Enable ce for sequence_tagging_for_ner

Merge pull request #1121 from kuke/sequence_tagging_for_ner_ce
Enable ce for sequence_tagging_for_ner
8f61de12 · Yibing Liu · GitHub · 5efb3d3d · c443f9b8 · 8f61de12
10 changed file
--- a/fluid/sequence_tagging_for_ner/.run_ce.sh
+++ b/fluid/sequence_tagging_for_ner/.run_ce.sh
+###!/bin/bash
+####This file is only used for continuous evaluation.
+
+export CE_MODE_X=1
+python train.py  | python _ce.py
--- a/fluid/sequence_tagging_for_ner/README.md
+++ b/fluid/sequence_tagging_for_ner/README.md
@@ -22,11 +22,7 @@

 ## 数据获取

-请参考PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) 一节中数据获取方式，将该例中的data文件夹拷贝至本例目录下，运行其中的download.sh脚本获取训练和测试数据。
-
-## 通用脚本获取
-
-请将PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md)中提供的用于数据读取的文件[reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py)以及包含字典导入等通用功能的文件[utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py)复制到本目录下。本例将会使用到这两个脚本。
+完整数据的获取请参考PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) 一节中的方式。本例的示例数据同样可以通过运行data/download.sh来获取。

 ## 训练


--- a/fluid/sequence_tagging_for_ner/_ce.py
+++ b/fluid/sequence_tagging_for_ner/_ce.py
+####this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared across the models in some way!
+
+train_acc_kpi = AccKpi('train_precision', 0.005, actived=True)
+test_acc_kpi = CostKpi('test_precision', 0.005, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    test_acc_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    #### Read the whole training log from stdin (see .run_ce.sh, which pipes
+    #### the output of train.py into this script).
+    log = sys.stdin.read()
+    #### Echo the raw log between two marker lines before parsing it.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/fluid/sequence_tagging_for_ner/data/download.sh
+++ b/fluid/sequence_tagging_for_ner/data/download.sh
+if [ -f assignment2.zip ]; then
+    echo "data exist"
+    exit 0
+else
+    wget http://cs224d.stanford.edu/assignment2/assignment2.zip
+fi
+
+if [ $? -eq 0  ];then
+    unzip assignment2.zip
+    cp assignment2_release/data/ner/wordVectors.txt ./data
+    cp assignment2_release/data/ner/vocab.txt ./data
+    rm -rf assignment2_release
+else
+  echo "download data error!" >> /dev/stderr
+  exit 1
+fi
+
--- a/fluid/sequence_tagging_for_ner/data/target.txt
+++ b/fluid/sequence_tagging_for_ner/data/target.txt
+B-LOC
+I-LOC
+B-MISC
+I-MISC
+B-ORG
+I-ORG
+B-PER
+I-PER
+O
--- a/fluid/sequence_tagging_for_ner/data/test
+++ b/fluid/sequence_tagging_for_ner/data/test
+CRICKET NNP I-NP O
+- : O O
+LEICESTERSHIRE NNP I-NP I-ORG
+TAKE NNP I-NP O
+OVER IN I-PP O
+AT NNP I-NP O
+TOP NNP I-NP O
+AFTER NNP I-NP O
+INNINGS NNP I-NP O
+VICTORY NN I-NP O
+. . O O
+
+LONDON NNP I-NP I-LOC
+1996-08-30 CD I-NP O
+
+West NNP I-NP I-MISC
+Indian NNP I-NP I-MISC
+all-rounder NN I-NP O
+Phil NNP I-NP I-PER
+Simmons NNP I-NP I-PER
+took VBD I-VP O
+four CD I-NP O
+for IN I-PP O
+38 CD I-NP O
+on IN I-PP O
+Friday NNP I-NP O
+as IN I-PP O
+Leicestershire NNP I-NP I-ORG
+beat VBD I-VP O
+Somerset NNP I-NP I-ORG
+by IN I-PP O
+an DT I-NP O
+innings NN I-NP O
+and CC O O
+39 CD I-NP O
+runs NNS I-NP O
+in IN I-PP O
+two CD I-NP O
+days NNS I-NP O
+to TO I-VP O
+take VB I-VP O
+over IN I-PP O
+at IN B-PP O
+the DT I-NP O
+head NN I-NP O
+of IN I-PP O
+the DT I-NP O
+county NN I-NP O
+championship NN I-NP O
+. . O O
+
+Their PRP$ I-NP O
+stay NN I-NP O
+on IN I-PP O
+top NN I-NP O
+, , O O
+though RB I-ADVP O
+, , O O
+may MD I-VP O
+be VB I-VP O
+short-lived JJ I-ADJP O
+as IN I-PP O
+title NN I-NP O
+rivals NNS I-NP O
+Essex NNP I-NP I-ORG
+, , O O
+Derbyshire NNP I-NP I-ORG
+and CC I-NP O
+Surrey NNP I-NP I-ORG
+all DT O O
+closed VBD I-VP O
+in RP I-PRT O
+on IN I-PP O
+victory NN I-NP O
+while IN I-SBAR O
+Kent NNP I-NP I-ORG
+made VBD I-VP O
+up RP I-PRT O
+for IN I-PP O
+lost VBN I-NP O
+time NN I-NP O
+in IN I-PP O
+their PRP$ I-NP O
+rain-affected JJ I-NP O
+match NN I-NP O
+against IN I-PP O
+Nottinghamshire NNP I-NP I-ORG
+. . O O
+
+After IN I-PP O
+bowling VBG I-NP O
+Somerset NNP I-NP I-ORG
+out RP I-PRT O
+for IN I-PP O
+83 CD I-NP O
+on IN I-PP O
+the DT I-NP O
+opening NN I-NP O
+morning NN I-NP O
+at IN I-PP O
+Grace NNP I-NP I-LOC
+Road NNP I-NP I-LOC
+, , O O
+Leicestershire NNP I-NP I-ORG
+extended VBD I-VP O
+their PRP$ I-NP O
+first JJ I-NP O
+innings NN I-NP O
+by IN I-PP O
+94 CD I-NP O
+runs VBZ I-VP O
+before IN I-PP O
+being VBG I-VP O
+bowled VBD I-VP O
+out RP I-PRT O
+for IN I-PP O
+296 CD I-NP O
+with IN I-PP O
+England NNP I-NP I-LOC
+discard VBP I-VP O
+Andy NNP I-NP I-PER
+Caddick NNP I-NP I-PER
+taking VBG I-VP O
+three CD I-NP O
+for IN I-PP O
+83 CD I-NP O
+. . O O
+
--- a/fluid/sequence_tagging_for_ner/data/train
+++ b/fluid/sequence_tagging_for_ner/data/train
+EU NNP I-NP I-ORG
+rejects VBZ I-VP O
+German JJ I-NP I-MISC
+call NN I-NP O
+to TO I-VP O
+boycott VB I-VP O
+British JJ I-NP I-MISC
+lamb NN I-NP O
+. . O O
+
+Peter NNP I-NP I-PER
+Blackburn NNP I-NP I-PER
+
+BRUSSELS NNP I-NP I-LOC
+1996-08-22 CD I-NP O
+
+The DT I-NP O
+European NNP I-NP I-ORG
+Commission NNP I-NP I-ORG
+said VBD I-VP O
+on IN I-PP O
+Thursday NNP I-NP O
+it PRP B-NP O
+disagreed VBD I-VP O
+with IN I-PP O
+German JJ I-NP I-MISC
+advice NN I-NP O
+to TO I-PP O
+consumers NNS I-NP O
+to TO I-VP O
+shun VB I-VP O
+British JJ I-NP I-MISC
+lamb NN I-NP O
+until IN I-SBAR O
+scientists NNS I-NP O
+determine VBP I-VP O
+whether IN I-SBAR O
+mad JJ I-NP O
+cow NN I-NP O
+disease NN I-NP O
+can MD I-VP O
+be VB I-VP O
+transmitted VBN I-VP O
+to TO I-PP O
+sheep NN I-NP O
+. . O O
+
+Germany NNP I-NP I-LOC
+'s POS B-NP O
+representative NN I-NP O
+to TO I-PP O
+the DT I-NP O
+European NNP I-NP I-ORG
+Union NNP I-NP I-ORG
+'s POS B-NP O
+veterinary JJ I-NP O
+committee NN I-NP O
+Werner NNP I-NP I-PER
+Zwingmann NNP I-NP I-PER
+said VBD I-VP O
+on IN I-PP O
+Wednesday NNP I-NP O
+consumers NNS I-NP O
+should MD I-VP O
+buy VB I-VP O
+sheepmeat NN I-NP O
+from IN I-PP O
+countries NNS I-NP O
+other JJ I-ADJP O
+than IN I-PP O
+Britain NNP I-NP I-LOC
+until IN I-SBAR O
+the DT I-NP O
+scientific JJ I-NP O
+advice NN I-NP O
+was VBD I-VP O
+clearer JJR I-ADJP O
+. . O O
+
+" " O O
+We PRP I-NP O
+do VBP I-VP O
+n't RB I-VP O
+support VB I-VP O
+any DT I-NP O
+such JJ I-NP O
+recommendation NN I-NP O
+because IN I-SBAR O
+we PRP I-NP O
+do VBP I-VP O
+n't RB I-VP O
+see VB I-VP O
+any DT I-NP O
+grounds NNS I-NP O
+for IN I-PP O
+it PRP I-NP O
+, , O O
+" " O O
+the DT I-NP O
+Commission NNP I-NP I-ORG
+'s POS B-NP O
+chief JJ I-NP O
+spokesman NN I-NP O
+Nikolaus NNP I-NP I-PER
+van NNP I-NP I-PER
+der FW I-NP I-PER
+Pas NNP I-NP I-PER
+told VBD I-VP O
+a DT I-NP O
+news NN I-NP O
+briefing NN I-NP O
+. . O O
+
+He PRP I-NP O
+said VBD I-VP O
+further JJ I-NP O
+scientific JJ I-NP O
+study NN I-NP O
+was VBD I-VP O
+required VBN I-VP O
+and CC O O
+if IN I-SBAR O
+it PRP I-NP O
+was VBD I-VP O
+found VBN I-VP O
+that IN I-SBAR O
+action NN I-NP O
+was VBD I-VP O
+needed VBN I-VP O
+it PRP I-NP O
+should MD I-VP O
+be VB I-VP O
+taken VBN I-VP O
+by IN I-PP O
+the DT I-NP O
+European NNP I-NP I-ORG
+Union NNP I-NP I-ORG
+. . O O
+
--- a/fluid/sequence_tagging_for_ner/reader.py
+++ b/fluid/sequence_tagging_for_ner/reader.py
+"""
+Conll03 dataset.
+"""
+
+from utils import *
+
+__all__ = ["data_reader"]
+
+
+def canonicalize_digits(word):
+    if any([c.isalpha() for c in word]): return word
+    word = re.sub("\d", "DG", word)
+    if word.startswith("DG"):
+        word = word.replace(",", "")  # remove thousands separator
+    return word
+
+
+def canonicalize_word(word, wordset=None, digits=True):
+    word = word.lower()
+    if digits:
+        if (wordset != None) and (word in wordset): return word
+        word = canonicalize_digits(word)  # try to canonicalize numbers
+    if (wordset == None) or (word in wordset): return word
+    else: return "UUUNKKK"  # unknown token
+
+
+def data_reader(data_file, word_dict, label_dict):
+    """
+    The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
+    It returns a reader creator, each sample in the reader includes:
+    word id sequence, label id sequence and raw sentence.
+
+    :return: reader creator
+    :rtype: callable
+    """
+
+    def reader():
+        UNK_IDX = word_dict["UUUNKKK"]
+
+        sentence = []
+        labels = []
+        with open(data_file, "r") as f:
+            for line in f:
+                if len(line.strip()) == 0:
+                    if len(sentence) > 0:
+                        word_idx = [
+                            word_dict.get(
+                                canonicalize_word(w, word_dict), UNK_IDX)
+                            for w in sentence
+                        ]
+                        mark = [1 if w[0].isupper() else 0 for w in sentence]
+                        label_idx = [label_dict[l] for l in labels]
+                        yield word_idx, mark, label_idx
+                    sentence = []
+                    labels = []
+                else:
+                    segs = line.strip().split()
+                    sentence.append(segs[0])
+                    # transform I-TYPE to BIO schema
+                    if segs[-1] != "O" and (len(labels) == 0 or
+                                            labels[-1][1:] != segs[-1][1:]):
+                        labels.append("B" + segs[-1][1:])
+                    else:
+                        labels.append(segs[-1])
+
+    return reader
--- a/fluid/sequence_tagging_for_ner/train.py
+++ b/fluid/sequence_tagging_for_ner/train.py
 import os
 import math
+import time
 import numpy as np

-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid

 import reader
@@ -24,12 +25,19 @@ def test(exe, chunk_evaluator, inference_program, test_data, place):
    return chunk_evaluator.eval(exe)


-def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
-         model_save_dir, num_passes, use_gpu, parallel):
+def main(train_data_file,
+         test_data_file,
+         vocab_file,
+         target_file,
+         emb_file,
+         model_save_dir,
+         num_passes,
+         use_gpu,
+         parallel,
+         batch_size=200):
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

-    BATCH_SIZE = 200
    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

@@ -58,55 +66,71 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
        test_target = chunk_evaluator.metrics + chunk_evaluator.states
        inference_program = fluid.io.get_inference_program(test_target)

+    if "CE_MODE_X" not in os.environ:
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.data_reader(train_data_file, word_dict, label_dict),
                buf_size=20000),
-        batch_size=BATCH_SIZE)
+            batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.data_reader(test_data_file, word_dict, label_dict),
                buf_size=20000),
-        batch_size=BATCH_SIZE)
+            batch_size=batch_size)
+    else:
+        train_reader = paddle.batch(
+            reader.data_reader(train_data_file, word_dict, label_dict),
+            batch_size=batch_size)
+        test_reader = paddle.batch(
+            reader.data_reader(test_data_file, word_dict, label_dict),
+            batch_size=batch_size)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

+    if "CE_MODE_X" in os.environ:
+        fluid.default_startup_program().random_seed = 110
    exe.run(fluid.default_startup_program())

    embedding_name = 'emb'
    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
    embedding_param.set(word_vector_values, place)

-    batch_id = 0
    for pass_id in xrange(num_passes):
        chunk_evaluator.reset(exe)
-        for data in train_reader():
+        for batch_id, data in enumerate(train_reader()):
            cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[avg_cost] + chunk_evaluator.metrics)
            if batch_id % 5 == 0:
+                print(cost)
                print("Pass " + str(pass_id) + ", Batch " + str(
                    batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str(
                        batch_precision[0]) + ", Recall " + str(batch_recall[0])
                      + ", F1_score" + str(batch_f1_score[0]))
-            batch_id = batch_id + 1

        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
            pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))
-        pass_precision, pass_recall, pass_f1_score = test(
+        test_pass_precision, test_pass_recall, test_pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader, place)
        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
-            pass_precision) + " pass_recall:" + str(pass_recall) +
-              " pass_f1_score:" + str(pass_f1_score))
+            test_pass_precision) + " pass_recall:" + str(test_pass_recall) +
+              " pass_f1_score:" + str(test_pass_f1_score))

        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
        fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
-                                      [crf_decode], exe)
+                                      crf_decode, exe)
+
+        if ("CE_MODE_X" in os.environ) and (pass_id % 50 == 0):
+            if pass_id > 0:
+                print("kpis	train_precision	%f" % pass_precision)
+                print("kpis	test_precision	%f" % test_pass_precision)
+                print("kpis	train_duration	%f" % (time.time() - time_begin))
+            time_begin = time.time()


 if __name__ == "__main__":
@@ -118,5 +142,6 @@ if __name__ == "__main__":
        emb_file="data/wordVectors.txt",
        model_save_dir="models",
        num_passes=1000,
+        batch_size=1,
        use_gpu=False,
        parallel=False)
--- a/fluid/sequence_tagging_for_ner/utils.py
+++ b/fluid/sequence_tagging_for_ner/utils.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import logging
+import os
+import re
+import argparse
+import numpy as np
+from collections import defaultdict
+
+# Module-level logger named "paddle" with its level set to INFO.
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def get_embedding(emb_file='data/wordVectors.txt'):
+    """
+    Get the trained word vector.
+    """
+    return np.loadtxt(emb_file, dtype=float)
+
+
+def load_dict(dict_path):
+    """
+    Load the word dictionary from the given file.
+    Each line of the given file is a word, which can include multiple columns
+    seperated by tab.
+
+    This function takes the first column (columns in a line are seperated by
+    tab) as key and takes line number of a line as the key (index of the word
+    in the dictionary).
+    """
+
+    return dict((line.strip().split("\t")[0], idx)
+                for idx, line in enumerate(open(dict_path, "r").readlines()))
+
+
+def load_reverse_dict(dict_path):
+    """
+    Load the word dictionary from the given file.
+    Each line of the given file is a word, which can include multiple columns
+    seperated by tab.
+
+    This function takes line number of a line as the key (index of the word in
+    the dictionary) and the first column (columns in a line are seperated by
+    tab) as the value.
+    """
+    return dict((idx, line.strip().split("\t")[0])
+                for idx, line in enumerate(open(dict_path, "r").readlines()))