未验证 提交 102ab268 编写于 作者: X XIE Xuan 提交者: GitHub

Merge pull request #212 from Oneflow-Inc/dev_resnet_print_memory_used

ResNet: print GPU utilization (utilization.gpu) and memory usage (memory.used) via nvidia-smi
......@@ -71,6 +71,7 @@ def get_parser(parser=None):
parser.add_argument(
"--use_xla", type=str2bool, nargs="?", const=True, help="Whether to use use xla"
)
parser.add_argument(
"--channel_last",
type=str2bool,
......@@ -94,6 +95,9 @@ def get_parser(parser=None):
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8)
parser.add_argument(
"--use_rdma", type=str2bool, nargs="?", const=True, help="Use rdma.",
)
parser.add_argument(
"--nccl_fusion_threshold_mb",
type=int,
......
......@@ -64,6 +64,8 @@ if args.nccl_fusion_threshold_mb:
if args.nccl_fusion_max_ops:
flow.config.collective_boxing.nccl_fusion_max_ops(args.nccl_fusion_max_ops)
if args.num_nodes > 1 and args.use_rdma:
flow.config.use_rdma(True)
def label_smoothing(labels, classes, eta, dtype):
assert classes > 0
......
......@@ -94,6 +94,7 @@ class Metric(object):
prediction_key="predictions",
label_key="labels",
loss_key=None,
nvidia_smi_report_step=10,
):
self.desc = desc
self.calculate_batches = calculate_batches
......@@ -101,6 +102,7 @@ class Metric(object):
self.prediction_key = prediction_key
self.label_key = label_key
self.loss_key = loss_key
self.nvidia_smi_report_step = nvidia_smi_report_step
if loss_key:
self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, top_1: {:.6f}, top_k: {:.6f}, samples/s: {:.3f}"
else:
......@@ -121,6 +123,10 @@ class Metric(object):
def callback(outputs):
if step == 0:
self._clear()
if self.loss_key and epoch == 0 and step == self.nvidia_smi_report_step:
cmd = "nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv"
os.system(cmd)
if self.prediction_key:
num_matched, num_samples = match_top_k(
outputs[self.prediction_key], outputs[self.label_key]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册