Benchmark run info logging (#3708)

* Init test for logging benchmark run. * Fix collect CPU info. * Update max split for handling GPU information. * Another fix for parse GPU info. * Fix GPU and CPU info collector. * Update logging function to be static. * Remove the cifar10 logging and fix a lint error. * Address the review comment. * Fix lint error. * Fix lint error for logger and logger_test. * Another lint fix for the test. * Simplify the CPU info logging. We will start in a conservative way, and probably add more info in the future. * Remove unused dependencies.

Benchmark run info logging (#3708)
* Init test for logging benchmark run. * Fix collect CPU info. * Update max split for handling GPU information. * Another fix for parse GPU info. * Fix GPU and CPU info collector. * Update logging function to be static. * Remove the cifar10 logging and fix a lint error. * Address the review comment. * Fix lint error. * Fix lint error for logger and logger_test. * Another lint fix for the test. * Simplify the CPU info logging. We will start in a conservative way, and probably add more info in the future. * Remove unused dependencies.
d3952b2c · Qianli Scott Zhu · GitHub · 848b2f17 · d3952b2c · d3952b2c
隐藏空白更改
内联并排

Showing 2 changed files with 132 additions and 4 deletions

official/utils/logging/logger.py official/utils/logging/logger.py +99 -1

official/utils/logging/logger_test.py official/utils/logging/logger_test.py +33 -3

未找到文件。
--- a/official/utils/logging/logger.py
+++ b/official/utils/logging/logger.py
@@ -13,19 +13,35 @@
 # limitations under the License.
 # ==============================================================================

-"""Logging utilities for benchmark."""
+"""Logging utilities for benchmark.
+
+For collecting local environment metrics like CPU and memory, certain Python
+packages need to be installed. Run the following commands to install them:
+  > pip install --upgrade py-cpuinfo
+  > pip install --upgrade psutil
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import datetime
 import json
+import multiprocessing
 import numbers
 import os

+# pylint: disable=g-bad-import-order
+# Note: cpuinfo and psutil are not installed in the TensorFlow OSS tree.
+# They are installable via pip.
+import cpuinfo
+import psutil
+# pylint: enable=g-bad-import-order
+
 import tensorflow as tf
+from tensorflow.python.client import device_lib

 _METRIC_LOG_FILE_NAME = "metric.log"
+_BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
 _DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"


@@ -91,3 +107,85 @@ class BenchmarkLogger(object):
      except (TypeError, ValueError) as e:
        tf.logging.warning("Failed to dump metric to log file: "
                           "name %s, value %s, error %s", name, value, e)
+
+  def log_run_info(self, model_name):
+    """Collect most of the TF runtime information for the local env.
+
+    The schema of the run info follows official/benchmark/datastore/schema.
+
+    Args:
+      model_name: string, the name of the model.
+    """
+    run_info = {"model_name": model_name}
+    _collect_tensorflow_info(run_info)
+    _collect_tensorflow_environment_variables(run_info)
+    _collect_cpu_info(run_info)
+    _collect_gpu_info(run_info)
+    _collect_memory_info(run_info)
+
+    with tf.gfile.GFile(os.path.join(
+        self._logging_dir, _BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
+      try:
+        json.dump(run_info, f)
+        f.write("\n")
+      except (TypeError, ValueError) as e:
+        tf.logging.warning("Failed to dump benchmark run info to log file: %s",
+                           e)
+
+
+def _collect_tensorflow_info(run_info):
+  run_info["tensorflow_version"] = {
+      "version": tf.VERSION, "git_hash": tf.GIT_VERSION}
+
+
+def _collect_tensorflow_environment_variables(run_info):
+  run_info["tensorflow_environment_variables"] = {
+      k: v for k, v in os.environ.items() if k.startswith("TF_")}
+
+
+# The following code is mirrored from tensorflow/tools/test/system_info_lib
+# which is not exposed for import.
+def _collect_cpu_info(run_info):
+  """Collect the CPU information for the local environment."""
+  cpu_info = {}
+
+  cpu_info["num_cores"] = multiprocessing.cpu_count()
+
+  info = cpuinfo.get_cpu_info()
+  cpu_info["cpu_info"] = info["brand"]
+  cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6
+
+  run_info["cpu_info"] = cpu_info
+
+
+def _collect_gpu_info(run_info):
+  """Collect local GPU information by TF device library."""
+  gpu_info = {}
+  local_device_protos = device_lib.list_local_devices()
+
+  gpu_info["count"] = len([d for d in local_device_protos
+                           if d.device_type == "GPU"])
+  # The device description is usually a comma-separated "key: value" string
+  # that contains the GPU model info, e.g.:
+  # "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0"
+  for d in local_device_protos:
+    if d.device_type == "GPU":
+      gpu_info["model"] = _parse_gpu_model(d.physical_device_desc)
+      # Assume all connected GPUs are the same model.
+      break
+  run_info["gpu_info"] = gpu_info
+
+
+def _collect_memory_info(run_info):
+  vmem = psutil.virtual_memory()
+  run_info["memory_total"] = vmem.total
+  run_info["memory_available"] = vmem.available
+
+
+def _parse_gpu_model(physical_device_desc):
+  # Scan the comma-separated "key: value" pairs and return the "name" value.
+  for kv in physical_device_desc.split(","):
+    k, _, v = kv.partition(":")
+    if k.strip() == "name":
+      return v.strip()
+  return None
--- a/official/utils/logging/logger_test.py
+++ b/official/utils/logging/logger_test.py
@@ -22,8 +22,10 @@ from __future__ import print_function
 import json
 import os
 import tempfile
+import unittest

 import tensorflow as tf  # pylint: disable=g-bad-import-order
+from tensorflow.python.client import device_lib

 from official.utils.logging import logger

@@ -88,9 +90,9 @@ class BenchmarkLoggerTest(tf.test.TestCase):
    self.assertFalse(tf.gfile.Exists(metric_log))

  def test_log_evaluation_result(self):
-    eval_result = {'loss': 0.46237424,
-                   'global_step': 207082,
-                   'accuracy': 0.9285}
+    eval_result = {"loss": 0.46237424,
+                   "global_step": 207082,
+                   "accuracy": 0.9285}
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    log = logger.BenchmarkLogger(log_dir)
    log.log_estimator_evaluation_result(eval_result)
@@ -119,5 +121,33 @@ class BenchmarkLoggerTest(tf.test.TestCase):
    metric_log = os.path.join(log_dir, "metric.log")
    self.assertFalse(tf.gfile.Exists(metric_log))

+  def test_collect_tensorflow_info(self):
+    run_info = {}
+    logger._collect_tensorflow_info(run_info)
+    self.assertNotEqual(run_info["tensorflow_version"], {})
+    self.assertEqual(run_info["tensorflow_version"]["version"], tf.VERSION)
+    self.assertEqual(run_info["tensorflow_version"]["git_hash"], tf.GIT_VERSION)
+
+  def test_collect_tensorflow_environment_variables(self):
+    os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
+
+    run_info = {}
+    logger._collect_tensorflow_environment_variables(run_info)
+    self.assertIsNotNone(run_info["tensorflow_environment_variables"])
+    self.assertEqual(run_info["tensorflow_environment_variables"]
+                     ["TF_ENABLE_WINOGRAD_NONFUSED"], "1")
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), "requires GPU")
+  def test_collect_gpu_info(self):
+    run_info = {}
+    logger._collect_gpu_info(run_info)
+    self.assertNotEqual(run_info["gpu_info"], {})
+
+  def test_collect_memory_info(self):
+    run_info = {}
+    logger._collect_memory_info(run_info)
+    self.assertIsNotNone(run_info["memory_total"])
+    self.assertIsNotNone(run_info["memory_available"])
+
 if __name__ == "__main__":
  tf.test.main()