add time computation

d3183617 · chenguowei01 · dbad6f4c · d3183617 · d3183617 · d3183617
Showing 98 additions and 7 deletions

dygraph/train.py dygraph/train.py +26 -4

dygraph/utils/__init__.py dygraph/utils/__init__.py +1 -0

dygraph/utils/timer.py dygraph/utils/timer.py +60 -0

dygraph/val.py dygraph/val.py +11 -3

未找到文件。
--- a/dygraph/train.py
+++ b/dygraph/train.py
@@ -14,6 +14,7 @@

 import argparse
 import os
+import time

 import paddle.fluid as fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
@@ -27,6 +28,7 @@ import utils.logging as logging
 from utils import get_environ_info
 from utils import load_pretrained_model
 from utils import resume
+from utils import Timer, calculate_eta
 from val import evaluate


@@ -111,6 +113,12 @@ def parse_args():
        dest='do_eval',
        help='Eval while training',
        action='store_true')
+    parser.add_argument(
+        '--log_steps',
+        dest='log_steps',
+        help='Display logging information at every log_steps',
+        default=10,
+        type=int)

    return parser.parse_args()

@@ -126,6 +134,7 @@ def train(model,
          pretrained_model=None,
          resume_model=None,
          save_interval_epochs=1,
+          log_steps=10,
          num_classes=None,
          num_workers=8):
    ignore_index = model.ignore_index
@@ -156,6 +165,10 @@ def train(model,
        return_list=True,
    )

+    timer = Timer()
+    timer.start()
+    steps_per_epoch = len(batch_sampler)
+    avg_loss = 0.0
    for epoch in range(start_epoch, num_epochs):
        for step, data in enumerate(loader):
            images = data[0]
@@ -170,11 +183,19 @@ def train(model,
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
+            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
-            logging.info(
-                "[TRAIN] Epoch={}/{}, Step={}/{}, loss={}, lr={}".format(
-                    epoch + 1, num_epochs, step + 1, len(batch_sampler),
-                    loss.numpy(), lr))
+            if step % log_steps == 0:
+                avg_loss /= log_steps
+                time_step = timer.elapsed_time() / log_steps
+                remain_step = (num_epochs - epoch) * steps_per_epoch - step - 1
+                logging.info(
+                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}"
+                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                            avg_loss, lr, time_step,
+                            calculate_eta(remain_step, time_step)))
+                avg_loss = 0.0
+                timer.restart()

        if ((epoch + 1) % save_interval_epochs == 0
                or epoch == num_epochs - 1) and ParallelEnv().local_rank == 0:
@@ -260,6 +281,7 @@ def main(args):
            pretrained_model=args.pretrained_model,
            resume_model=args.resume_model,
            save_interval_epochs=args.save_interval_epochs,
+            log_steps=args.log_steps,
            num_classes=train_dataset.num_classes,
            num_workers=args.num_workers)


--- a/dygraph/utils/__init__.py
+++ b/dygraph/utils/__init__.py
@@ -16,3 +16,4 @@ from . import logging
 from . import download
 from .metrics import ConfusionMatrix
 from .utils import *
+from .timer import Timer, calculate_eta
--- a/dygraph/utils/timer.py
+++ b/dygraph/utils/timer.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+
+class Timer(object):
+    """Simple wall-clock stopwatch for measuring elapsed time.
+
+    Call ``start()`` (or ``restart()``) to begin an interval, then
+    ``elapsed_time()`` to read how many seconds have passed since then.
+    ``stop()`` freezes the interval so it can still be read afterwards.
+    All readings are taken with ``time.time()``.
+    """
+
+    def __init__(self):
+        # Both timestamps are ``time.time()`` values; 0.0 means "never set".
+        self._start_time = 0.0
+        self._end_time = 0.0
+        self._elapsed_time = 0.0
+        self._is_running = False
+
+    def start(self):
+        """Begin a new interval at the current time."""
+        self._is_running = True
+        self._start_time = time.time()
+
+    def restart(self):
+        """Discard the current interval and begin a new one."""
+        self.start()
+
+    def stop(self):
+        """Freeze the current interval at the current time.
+
+        Does nothing if the timer is not running, so stopping a timer that
+        was never started (or stopping twice) leaves its reading unchanged.
+        """
+        if self._is_running:
+            self._end_time = time.time()
+            self._is_running = False
+
+    def elapsed_time(self):
+        """Return the length of the current interval in seconds.
+
+        While the timer is running this is the time since the last
+        ``start()``/``restart()``. After ``stop()`` it is the frozen
+        interval between the last start and that stop. A timer that was
+        never started returns 0.0.
+        """
+        # Only advance the end mark while running; otherwise the timestamp
+        # recorded by stop() would be overwritten and the measurement lost.
+        if self._is_running:
+            self._end_time = time.time()
+        self._elapsed_time = self._end_time - self._start_time
+        return self._elapsed_time
+
+    @property
+    def is_running(self):
+        """Whether an interval is currently being timed."""
+        return self._is_running
+
+
+def calculate_eta(remaining_step, speed):
+    """Format the estimated time left as an ``HH:MM:SS`` string.
+
+    Args:
+        remaining_step: Number of steps still to run; negative values are
+            treated as 0.
+        speed: Time taken per step, in seconds.
+
+    Returns:
+        The estimate ``remaining_step * speed`` truncated to whole seconds
+        and rendered as hours, minutes and seconds, each zero-padded to at
+        least two characters. Hours are not wrapped into days.
+    """
+    if remaining_step < 0:
+        remaining_step = 0
+    total_seconds = int(remaining_step * speed)
+
+    # Peel off hours first, then minutes; whatever is left is seconds.
+    hours = int(total_seconds / 3600)
+    leftover = total_seconds % 3600
+    minutes = int(leftover / 60)
+    seconds = leftover % 60
+
+    return "{:0>2}:{:0>2}:{:0>2}".format(hours, minutes, seconds)
--- a/dygraph/val.py
+++ b/dygraph/val.py
@@ -29,6 +29,7 @@ import models
 import utils.logging as logging
 from utils import get_environ_info
 from utils import ConfusionMatrix
+from utils import Timer, calculate_eta


 def parse_args():
@@ -96,12 +97,14 @@ def evaluate(model,
        places=places,
        return_list=True,
    )
-    total_steps = math.ceil(len(eval_dataset) * 1.0 / batch_size)
+    total_steps = len(batch_sampler)
    conf_mat = ConfusionMatrix(num_classes, streaming=True)

    logging.info(
        "Start to evaluating(total_samples={}, total_steps={})...".format(
            len(eval_dataset), total_steps))
+    timer = Timer()
+    timer.start()
    for step, data in enumerate(loader):
        images = data[0]
        labels = data[1].astype('int64')
@@ -113,8 +116,13 @@ def evaluate(model,
        conf_mat.calculate(pred=pred, label=labels, ignore=mask)
        _, iou = conf_mat.mean_iou()

-        logging.info("[EVAL] Epoch={}, Step={}/{}, iou={}".format(
-            epoch_id, step + 1, total_steps, iou))
+        time_step = timer.elapsed_time()
+        remain_step = total_steps - step - 1
+        logging.info(
+            "[EVAL] Epoch={}, Step={}/{}, iou={}, sec/step={:.4f} | ETA {}".
+            format(epoch_id, step + 1, total_steps, iou, time_step,
+                   calculate_eta(remain_step, time_step)))
+        timer.restart()

    category_iou, miou = conf_mat.mean_iou()
    category_acc, macc = conf_mat.accuracy()