Unverified commit 97c2076f authored by guochaorong and committed by GitHub

Merge pull request #1116 from wanghaoshuang/ce_ocr

 Add ce for ocr model.
python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True
python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True | python _ce.py
# this file is only used for continuous evaluation test!

import os
import sys
sys.path.append(os.environ['ceroot'])
from kpi import CostKpi, DurationKpi, AccKpi

# NOTE: kpi.py should be shared across models in some way!!!!

train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)

tracking_kpis = [
    train_acc_kpi,
    train_cost_kpi,
    test_acc_kpi,
    train_duration_kpi,
]


def parse_log(log):
    '''
    This method should be implemented by model developers.

    The suggestion: each line in the log should be a key-value pair, for example:
    "
    train_cost\t1.0
    test_cost\t1.0
    train_cost\t1.0
    train_cost\t1.0
    train_acc\t1.2
    "
    '''
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        # Only tab-separated lines of the form "kpis\t<name>\t<value>"
        # are treated as KPI records.
        if len(fs) == 3 and fs[0] == 'kpis':
            kpi_name = fs[1]
            kpi_value = float(fs[2])
            yield kpi_name, kpi_value


def log_to_ce(log):
    kpi_tracker = {}
    for kpi in tracking_kpis:
        kpi_tracker[kpi.name] = kpi

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi_tracker[kpi_name].add_record(kpi_value)
        kpi_tracker[kpi_name].persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
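For reference, parse_log only picks up tab-separated lines of the form kpis\t<name>\t<value>; everything else in the training log is ignored. Below is a minimal, self-contained sketch of that format; the sample numbers are made up for illustration.

# Minimal sketch of the tab-separated KPI log format that parse_log consumes.
# The sample values below are illustrative only.
sample_log = "\n".join([
    "Time: 1533550000.0; Iter[200]; Test seq error: 0.05.",
    "kpis\ttrain_cost\t0.500000",
    "kpis\ttrain_acc\t0.900000",
    "kpis\ttest_acc\t0.950000",
    "kpis\ttrain_duration\t120.000000",
])

for line in sample_log.split('\n'):
    fields = line.strip().split('\t')
    # Only lines of the form "kpis\t<name>\t<value>" are recognized as KPIs.
    if len(fields) == 3 and fields[0] == 'kpis':
        print(fields[1], float(fields[2]))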
@@ -103,6 +103,10 @@ def train(args, data_reader=ctc_reader):
print "\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
time.time(), iter_num, str(test_seq_error[0]))
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
print "kpis test_acc %f" % (1 - test_seq_error[0])
def save_model(args, exe, iter_num):
filename = "model_%05d" % iter_num
fluid.io.save_params(
@@ -111,6 +115,7 @@ def train(args, data_reader=ctc_reader):
    iter_num = 0
    stop = False
    start_time = time.time()

    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
@@ -139,11 +144,15 @@ def train(args, data_reader=ctc_reader):
                    time.time(), iter_num,
                    total_loss / (args.log_period * args.batch_size),
                    total_seq_error / (args.log_period * args.batch_size))
                print "kpis\ttrain_cost\t%f" % (total_loss / (
                    args.log_period * args.batch_size))
                print "kpis\ttrain_acc\t%f" % (
                    1 - total_seq_error / (args.log_period * args.batch_size))
                sys.stdout.flush()
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
@@ -158,6 +167,8 @@ def train(args, data_reader=ctc_reader):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)
    end_time = time.time()
    print "kpis\ttrain_duration\t%f" % (end_time - start_time)
    # Postprocess benchmark data
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
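The kpi module itself is loaded from $ceroot and is not part of this pull request, so the sketch below only models the calling pattern that _ce.py relies on (name, add_record, persist). FakeKpi is a hypothetical stand-in, not the real CostKpi/AccKpi/DurationKpi implementation, and its constructor argument names are assumptions.

# Hypothetical stand-in for the KPI classes imported from kpi.py; only the
# interface used by _ce.py (name, add_record, persist) is modeled here.
class FakeKpi(object):
    def __init__(self, name, diff_threshold, skip_head, actived=True):
        # Argument names are assumptions; _ce.py passes e.g. ('train_acc', 0.005, 0).
        self.name = name
        self.records = []

    def add_record(self, value):
        self.records.append(value)

    def persist(self):
        # The real kpi.py presumably stores history and checks thresholds;
        # here we only print what would be persisted.
        print("persist %s -> %s" % (self.name, self.records))


train_acc = FakeKpi('train_acc', 0.005, 0, actived=True)
train_acc.add_record(0.9)  # value parsed from a "kpis\ttrain_acc\t0.9" log line
train_acc.persist()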