Add CE to dygraph Se-Resnext model (#2699)

Update mnist_dygraph.py fix bug * add ce to se_resnext * delete useless comments and fix unique_name bugs

Add CE to dygraph Se-Resnext model (#2699)
Update mnist_dygraph.py fix bug * add ce to se_resnext * delete useless comments and fix unique_name bugs
9d690da1 · Divano · lujun · 9d18809a · 9d690da1 · 9d690da1
Showing 3 changed files with 101 additions and 9 deletions

dygraph/se_resnext/.run_ce.sh dygraph/se_resnext/.run_ce.sh +9 -0

dygraph/se_resnext/_ce.py dygraph/se_resnext/_ce.py +64 -0

dygraph/se_resnext/train.py dygraph/se_resnext/train.py +28 -9

未找到文件。
--- a/dygraph/se_resnext/.run_ce.sh
+++ b/dygraph/se_resnext/.run_ce.sh
+#!/bin/bash
+# This file is only used for continuous evaluation.
+# dygraph single card
+# NOTE(review): the flag name suggests it forces deterministic cuDNN kernels so
+# that repeated CE runs are comparable -- confirm against the Paddle flag docs.
+export FLAGS_cudnn_deterministic=True
+# Expose only GPU index 5 to the process. The index is hard-coded, so the host
+# running CE must provide that device.
+export CUDA_VISIBLE_DEVICES=5
+# Run a single CE epoch with unbuffered output (-u) and pipe the training log
+# into _ce.py, which reads it from stdin.
+python -u train.py --ce --epoch 1 | python _ce.py
+#python train.py --ce --epoch 1 | python _ce.py
--- a/dygraph/se_resnext/_ce.py
+++ b/dygraph/se_resnext/_ce.py
+#### This file is only used for the continuous evaluation (CE) test.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+# The `kpi` module is imported from the directory named by the `ceroot`
+# environment variable; os.environ[...] raises KeyError if it is not set.
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+#### NOTE: kpi.py should be shared across models in some way.
+# One KPI object per metric; the names match the `kpis\t<name>\t<value>` lines
+# that train.py prints in --ce mode.
+# NOTE(review): the positional arguments (0.01, 0) are presumably a tolerance
+# threshold and a number of leading records to skip -- confirm against kpi.py.
+train_acc1 = AccKpi('train_acc1', 0.01, 0, actived=True, desc="train acc1")
+train_acc5 = AccKpi('train_acc5', 0.01, 0, actived=True, desc="train acc5")
+train_loss = CostKpi('train_loss', 0.01, 0, actived=True, desc="train loss")
+test_acc1 = AccKpi('test_acc1', 0.01, 0, actived=True, desc='test acc1')
+test_acc5 = AccKpi('test_acc5', 0.01, 0, actived=True, desc='test acc5')
+test_loss = CostKpi('test_loss', 0.01, 0, actived=True, desc='test loss')
+# KPIs that log_to_ce() looks up by name; a name parsed from the log that is
+# not listed here cannot be recorded.
+tracking_kpis = [train_acc1, train_acc5, train_loss,
+                 test_acc1, test_acc5, test_loss]
+def parse_log(log):
+    '''
+    Extract KPI records from a training log.
+
+    The log is split on newlines and each stripped line is split on tabs.
+    Only lines with exactly three fields whose first field is 'kpis' are
+    treated as records, for example:
+    "
+    kpis\ttrain_loss\t1.0
+    kpis\ttest_acc1\t0.5
+    "
+    All other lines are not yielded.
+
+    Yields (kpi_name, kpi_value) tuples, where kpi_value is the third field
+    converted with float(); float() raises ValueError if that field is not
+    numeric.
+
+    Note: the split fields of every line are also printed to stdout.
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        # Debug output: echoes the fields of every log line, matched or not.
+        print(fs)
+        # A record line has the form: kpis <TAB> name <TAB> value.
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+def log_to_ce(log):
+    '''
+    Feed every KPI record found in `log` to the matching tracked KPI.
+
+    Builds a name -> KPI mapping from `tracking_kpis`, then for each
+    (name, value) pair yielded by parse_log() prints the pair and calls
+    add_record(value) followed by persist() on the KPI with that name.
+
+    Raises KeyError if the log contains a KPI name that is not in
+    `tracking_kpis`.
+    '''
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        # persist() is called after every record, not once at the end.
+        # NOTE(review): presumably this writes the KPI history out for the CE
+        # framework -- confirm against kpi.py.
+        kpi_tracker[kpi_name].persist()
+if __name__ == '__main__':
+    # The whole training log is read from stdin in one call, so nothing is
+    # processed until the upstream process closes its output.
+    log = sys.stdin.read()
+    # Echo the raw log between marker lines before extracting KPIs from it.
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
--- a/dygraph/se_resnext/train.py
+++ b/dygraph/se_resnext/train.py
@@ -25,7 +25,12 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
 import sys
 import math
+import argparse
+parser = argparse.ArgumentParser("Training for Se-ResNeXt.")
+parser.add_argument("-e", "--epoch", default=200, type=int, help="set epoch")
+parser.add_argument("--ce", action="store_true", help="run ce") 
+args = parser.parse_args()
 batch_size = 64
 train_parameters = {
    "input_size": [3, 224, 224],
@@ -324,12 +329,12 @@ def eval(model, data):
        label = to_variable(y_data)
        label._stop_gradient = True
        out = model(img)
-        cost,pred = fluid.layers.softmax_with_cross_entropy(out,label,return_softmax=True)
-        avg_loss = fluid.layers.mean(x=cost)
-        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
+        softmax_out = fluid.layers.softmax(out,use_cudnn=False)
+        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
+        avg_loss = fluid.layers.mean(x=loss)
+        acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
        dy_out = avg_loss.numpy()
        total_loss += dy_out
@@ -341,19 +346,28 @@ def eval(model, data):
                  ( batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample))
+    if args.ce:
+        print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample))
+        print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample))
+        print("kpis\ttest_loss\t%0.3f" % (total_loss / total_sample))
    print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
          (total_loss / total_sample, \
           total_acc1 / total_sample, total_acc5 / total_sample))
 def train():
-    seed = 90
    epoch_num = train_parameters["num_epochs"]
+    if args.ce:
+        epoch_num = args.epoch
    batch_size = train_parameters["batch_size"]
    with fluid.dygraph.guard():
-        fluid.default_startup_program().random_seed = 90
+        if args.ce:
-        fluid.default_main_program().random_seed = 90
+            print("ce mode")
+            seed = 90
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
        se_resnext = SeResNeXt("se_resnext")
        optimizer = optimizer_setting(train_parameters)
@@ -404,10 +418,15 @@ def train():
                total_acc5 += acc_top5.numpy()
                total_sample += 1
                if batch_id % 10 == 0:
+                    print(fluid.dygraph.base._print_debug_msg())
                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f lr %0.5f" % \
                           ( epoch_id, batch_id, total_loss / total_sample, \
                             total_acc1 / total_sample, total_acc5 / total_sample, lr))
+            if args.ce:
+                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
+                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
+                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                  (epoch_id, batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample))