提交 98ad0f9c 编写于 作者: O ouyangyu

resnet: print GPU utilization (utilization.gpu) and memory usage (memory.used) via nvidia-smi

上级 a5460d74
...@@ -71,6 +71,7 @@ def get_parser(parser=None): ...@@ -71,6 +71,7 @@ def get_parser(parser=None):
parser.add_argument( parser.add_argument(
"--use_xla", type=str2bool, nargs="?", const=True, help="Whether to use use xla" "--use_xla", type=str2bool, nargs="?", const=True, help="Whether to use use xla"
) )
parser.add_argument( parser.add_argument(
"--channel_last", "--channel_last",
type=str2bool, type=str2bool,
...@@ -94,6 +95,9 @@ def get_parser(parser=None): ...@@ -94,6 +95,9 @@ def get_parser(parser=None):
parser.add_argument("--batch_size_per_device", type=int, default=64) parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8) parser.add_argument("--val_batch_size_per_device", type=int, default=8)
parser.add_argument(
"--use_rdma", type=str2bool, nargs="?", const=True, help="Use rdma.",
)
parser.add_argument( parser.add_argument(
"--nccl_fusion_threshold_mb", "--nccl_fusion_threshold_mb",
type=int, type=int,
......
...@@ -64,6 +64,8 @@ if args.nccl_fusion_threshold_mb: ...@@ -64,6 +64,8 @@ if args.nccl_fusion_threshold_mb:
if args.nccl_fusion_max_ops: if args.nccl_fusion_max_ops:
flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops) flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops)
if args.num_nodes > 1 and args.use_rdma:
flow.config.use_rdma(True)
def label_smoothing(labels, classes, eta, dtype): def label_smoothing(labels, classes, eta, dtype):
assert classes > 0 assert classes > 0
......
...@@ -94,6 +94,7 @@ class Metric(object): ...@@ -94,6 +94,7 @@ class Metric(object):
prediction_key="predictions", prediction_key="predictions",
label_key="labels", label_key="labels",
loss_key=None, loss_key=None,
nvidia_smi_report_step=10,
): ):
self.desc = desc self.desc = desc
self.calculate_batches = calculate_batches self.calculate_batches = calculate_batches
...@@ -101,6 +102,7 @@ class Metric(object): ...@@ -101,6 +102,7 @@ class Metric(object):
self.prediction_key = prediction_key self.prediction_key = prediction_key
self.label_key = label_key self.label_key = label_key
self.loss_key = loss_key self.loss_key = loss_key
self.nvidia_smi_report_step = nvidia_smi_report_step
if loss_key: if loss_key:
self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}" self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}"
else: else:
...@@ -121,6 +123,10 @@ class Metric(object): ...@@ -121,6 +123,10 @@ class Metric(object):
def callback(outputs): def callback(outputs):
if step == 0: if step == 0:
self._clear() self._clear()
if self.loss_key and epoch == 0 and step == self.nvidia_smi_report_step:
cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv"
os.system(cmd)
if self.prediction_key: if self.prediction_key:
num_matched, num_samples = match_top_k( num_matched, num_samples = match_top_k(
outputs[self.prediction_key], outputs[self.label_key] outputs[self.prediction_key], outputs[self.label_key]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册