// Data-flow diagram of one training step replicated on two GPUs:
// initialization of W, per-GPU forward/backward, gradient merge, and
// per-GPU optimization.
//
// Statement order and the subgraph in which each edge is declared are kept
// as in the original: in DOT, an edge statement inside a subgraph also makes
// its endpoints members of that subgraph, which can influence the layout.
digraph G {
  // Initialization: the startup program produces W on GPU0, which is then
  // broadcast to GPU1.
  subgraph cluster_init {
    label="Initialization"
    startup_program [label="startup", shape=box]
    node_w_g0 [label="W\nGPU0"]
    startup_program -> node_w_g0 [label="Initialize"]
    node_w_g1 [label="W\nGPU1"]
    node_w_g0 -> node_w_g1 [label="broadcast"]
  }

  // Forward/backward: the two GPU clusters below are structurally identical
  // and differ only in the _0/_1 suffixes and the GPU named in each label.
  subgraph cluster_train {
    label="forward_backward"

    subgraph cluster_gpu0 {
      label="GPU0"
      fc_0 [label="fc\nGPU0", shape=box]
      hidden_0 [label="hidden\nGPU0"]
      node_w_g0 -> fc_0
      fc_0 -> hidden_0
      loss0 [label="loss\nGPU0"]
      hidden_0 -> loss0 [label="many ops omitted"]
      scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
      loss_g0 [label="loss_grad\nGPU0"]
      scale_loss_0->loss_g0

      // NOTE(review): w_grad is drawn with shape=box like the op nodes
      // (fc, scale_loss_gradient, SGD Op) - confirm that is intended.
      fc_g_0 [label="w_grad\nGPU0", shape=box]
      loss0 -> fc_g_0
      loss_g0 -> fc_g_0
      hidden_0 -> fc_g_0
    }

    subgraph cluster_gpu1 {
      label="GPU1"
      fc_1 [label="fc\nGPU1", shape=box]
      hidden_1 [label="hidden\nGPU1"]
      node_w_g1 -> fc_1
      fc_1 -> hidden_1
      loss1 [label="loss\nGPU1"]
      hidden_1 -> loss1 [label="many ops omitted"]
      scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
      loss_g1 [label="loss_grad\nGPU1"]
      scale_loss_1->loss_g1

      fc_g_1 [label="w_grad\nGPU1", shape=box]
      loss1 -> fc_g_1
      loss_g1 -> fc_g_1
      hidden_1 -> fc_g_1
    }
  }

  // Gradient merge: both per-GPU w_grad nodes feed a single AllReduce node,
  // which fans out to one merged gradient per GPU.
  all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
  fc_g_0 -> all_reduce_w
  fc_g_1 -> all_reduce_w
  fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
  fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
  all_reduce_w -> fc_g_0_merged
  all_reduce_w -> fc_g_1_merged

  // Optimization: each GPU's SGD op takes its merged gradient and its own
  // copy of W and produces that GPU's optimized W.
  subgraph cluster_optimization {
    label="Optimization"

    subgraph cluster_opt_gpu0 {
      label="GPU0"
      sgd_0 [label="SGD Op\nGPU0", shape=box]
      fc_g_0_merged -> sgd_0
      node_w_g0 -> sgd_0
      optimized_w_0 [label="Optimized W\nGPU0"]
      sgd_0 -> optimized_w_0
    }

    subgraph cluster_opt_gpu1 {
      label="GPU1"
      sgd_1 [label="SGD Op\nGPU1", shape=box]
      fc_g_1_merged -> sgd_1
      node_w_g1 -> sgd_1
      // Label names GPU1 to match this cluster and its producer sgd_1
      // (it previously read "GPU0", duplicating optimized_w_0's label).
      optimized_w_1 [label="Optimized W\nGPU1"]
      sgd_1 -> optimized_w_1
    }
  }
}