add Resnet ce (#2502)

* add ce for dygraph mnist * add ce for dygraph mnist * del mnist_dygraph.py * change mnist_dygraph to train * fix print style * add resnet * fix ce bug * fix ce description

add Resnet ce (#2502)
* add ce for dygraph mnist * add ce for dygraph mnist * del mnist_dygraph.py * change mnist_dygraph to train * fix print style * add resnet * fix ce bug * fix ce description
c25124db · Divano · GitHub · 4bb42e25 · c25124db · c25124db
隐藏空白更改
内联并排

Showing 100 additions and 6 deletions

dygraph/resnet/.run_ce.sh dygraph/resnet/.run_ce.sh +8 -0

dygraph/resnet/_ce.py dygraph/resnet/_ce.py +70 -0

dygraph/resnet/train.py dygraph/resnet/train.py +22 -6

未找到文件。
--- a/dygraph/resnet/.run_ce.sh
+++ b/dygraph/resnet/.run_ce.sh
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+# dygraph single card
+export FLAGS_cudnn_deterministic=True  # NOTE(review): presumably requests deterministic cuDNN kernels -- confirm against the Paddle flag docs
+export CUDA_VISIBLE_DEVICES=0  # expose only GPU 0 to the training process
+python train.py --ce --epoch 1 --batch_size 128 | python _ce.py  # one CE-mode epoch; _ce.py reads the training log from stdin
+
--- a/dygraph/resnet/_ce.py
+++ b/dygraph/resnet/_ce.py
+#### This file is only used for the continuous evaluation test!
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE: kpi.py should be shared in models in some way!
+
+train_acc1 = AccKpi('train_acc1', 0.01, 0, actived=True, desc="train acc1")  # KPI names must match the name field of the "kpis" log lines; log_to_ce looks them up by kpi.name
+train_acc5 = AccKpi('train_acc5', 0.01, 0, actived=True, desc="train acc5")  # NOTE(review): meaning of the positional args (0.01, 0) is defined in kpi.py, not shown here -- confirm
+train_loss = CostKpi('train_loss', 0.01, 0, actived=True, desc="train loss")
+test_acc1 = AccKpi('test_acc1', 0.01, 0, actived=True, desc='test acc1')
+test_acc5 = AccKpi('test_acc5', 0.01, 0, actived=True, desc='test acc5')
+test_loss = CostKpi('test_loss', 0.01, 0, actived=True, desc='test loss')
+#train_speed_kpi = DurationKpi(
+#    'train_speed',
+#    0.05,
+#    0,
+#    actived=True,
+#    unit_repr='seconds/image',
+#    desc='train speed in one GPU card')
+tracking_kpis = [train_acc1, train_acc5, train_loss,
+                 test_acc1, test_acc5, test_loss]  # the speed KPI above is commented out and therefore not tracked
+
+def parse_log(log):
+    '''
+    Extract KPI records from the training log.
+
+    Only lines made of exactly three tab-separated fields whose first
+    field is 'kpis' are treated as records, for example:
+
+    "
+    kpis\ttrain_loss\t1.0
+    kpis\ttest_acc1\t0.5
+    "
+
+    Every line of the log is echoed to stdout as its list of fields, and
+    each matching line is echoed once more with a '-----' prefix.
+
+    Args:
+        log: the complete log text as one string.
+
+    Yields:
+        (kpi_name, kpi_value) pairs, with kpi_value converted to float.
+
+    Raises:
+        ValueError: if the third field of a 'kpis' line is not a valid float.
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    '''
+    Record every KPI value found in the log on its matching KPI object.
+
+    Args:
+        log: the complete log text as one string; it is handed to parse_log.
+
+    Raises:
+        KeyError: if the log reports a KPI name that is not in tracking_kpis.
+    '''
+    kpi_tracker = {}  # maps KPI name -> KPI object
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()  # persist() is called after every single record, not once at the end
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()  # the whole log is read from stdin before any parsing starts
+    print("*****")
+    print(log)  # echo the raw log between the two marker lines
+    print("****")
+    log_to_ce(log)
--- a/dygraph/resnet/train.py
+++ b/dygraph/resnet/train.py
@@ -26,8 +26,6 @@ from paddle.fluid import framework
 import math
 import sys

-batch_size = 32
-epoch = 120
 IMAGENET1000 = 1281167
 base_lr = 0.1
 momentum_rate = 0.9
@@ -35,18 +33,21 @@ l2_decay = 1e-4


 def parse_args():
-    parser = argparse.ArgumentParser("Training for Mnist.")
+    parser = argparse.ArgumentParser("Training for Resnet.")
    parser.add_argument(
        "--use_data_parallel",
        type=ast.literal_eval,
        default=False,
        help="The flag indicating whether to shuffle instances in each pass.")
+    parser.add_argument("-e", "--epoch", default=120, type=int, help="set epoch")
+    parser.add_argument("-b", "--batch_size", default=32, type=int, help="set epoch")
+    parser.add_argument("--ce", action="store_true", help="run ce")
    args = parser.parse_args()
    return args


 args = parse_args()
-
+batch_size = args.batch_size

 def optimizer_setting():

@@ -263,16 +264,28 @@ def eval(model, data):
            print("test | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                  ( batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample))
+    if args.ce:
+        print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample))
+        print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample))
+        print("kpis\ttest_loss\t%0.3f" % (total_loss / total_sample))
    print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
          (total_loss / total_sample, \
           total_acc1 / total_sample, total_acc5 / total_sample))


 def train_resnet():
+    epoch = args.epoch
    trainer_count = fluid.dygraph.parallel.Env().nranks
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if args.use_data_parallel else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
+        if args.ce:
+            print("ce mode")
+            seed = 33
+            np.random.seed(seed)
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

@@ -340,24 +353,27 @@ def train_resnet():
                optimizer.minimize(avg_loss)
                resnet.clear_gradients()

-                framework._dygraph_tracer_._clear_ops()

                total_loss += dy_out
                total_acc1 += acc_top1.numpy()
                total_acc5 += acc_top5.numpy()
                total_sample += 1
-
                #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
                if batch_id % 10 == 0:
                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                           ( eop, batch_id, total_loss / total_sample, \
                             total_acc1 / total_sample, total_acc5 / total_sample))

+            if args.ce:
+                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
+                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
+                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
                  (eop, batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample))
            resnet.eval()
            eval(resnet, test_reader)
+            fluid.dygraph.save_persistables(resnet.state_dict(), 'resnet_params')


 if __name__ == '__main__':