# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import queue
import copy
from enum import Enum

import numpy as np

import paddle
from paddle.fluid import core
from paddle.distributed.fleet.meta_optimizers.common import OpRole

SUCC = 0  # successor
PRED = 1  # predecessor


class CostNodeType(Enum):
    DEFAULT = 0
    COMPUTATION = 1
    COMMUNICATION = 2
    VARIABLE = 3
    MERGED = 4
    NOP = 5


class Cost(object):
    def __init__(self):
        self.runtime = None
        self.static_mem = None
        self.peak_mem = None


class CostModelMode(Enum):
    DEFAULT = 0
    BENCHMARKING = 1  # costs based on trial runs
    ANALYSIS = 2  # costs based on analysis
    MIXED = 3


class CostNode(object):
    def __init__(self, node, node_type, id=None):
        self.id = id
        self.node = node
        self.type = node_type
        self._cost = 0
        self.is_optim = False
        self.is_bwd = False

    @property
    def cost(self):
        return self._cost

    @cost.setter
    def cost(self, cost):
        if cost < 0:
            raise ValueError('Cost must be non-negative.')
        self._cost = cost


class MergedOpsCostNode(CostNode):
    def __init__(self, node_type, id=None, base_node_list=None, is_bwd=False):
        super(MergedOpsCostNode, self).__init__(None, node_type, id)
        self.node_list = base_node_list
        self.is_bwd = is_bwd


class CommOpCostNode(CostNode):
    def __init__(self,
                 node,
                 node_type,
                 id=None,
                 comm_node_list=None,
                 is_bwd=False):
        super(CommOpCostNode, self).__init__(node, node_type, id)
        self.node_list = comm_node_list
        self.ranks = []
        self.comm_type = node.type
        self.is_bwd = is_bwd

    def set_ranks(self, ranks):
        self.ranks = ranks

    def set_shapes(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape

    def init_comm_cost(self, cluster=None):
        # ref: https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md
        # should get from `cluster`
        BANDWIDTH = 32 * 1024 / 1000  # MB/ms, V100 PCIe
        num_ranks = len(self.ranks)
        comm_volume = np.prod(self.input_shape) * 4

        if 'allreduce' in self.comm_type:
            self._cost = comm_volume / (BANDWIDTH * num_ranks /
                                        (2 * (num_ranks - 1)))
        elif 'gather' in self.comm_type:
            self._cost = comm_volume / (BANDWIDTH * num_ranks /
                                        (num_ranks - 1))
        elif 'broadcast' in self.comm_type:
            self._cost = comm_volume / BANDWIDTH
        elif 'send' in self.comm_type or 'recv' in self.comm_type:
            self._cost = comm_volume / BANDWIDTH
        else:
            self._cost = 0


class TensorCostNode(CostNode):
    def __init__(self,
                 node,
                 node_type,
                 id=None,
                 base_node_list=None,
                 batch_size=None,
                 shared_node_id=None):
        super(TensorCostNode, self).__init__(node, node_type, id)
        if node.name == "create_py_reader_0" or node.name == "double_buffer_0":
            self.shape = [2, 2]
            self.dtype = paddle.float32
        else:
            self.shape = node.shape
            self.dtype = node.dtype
        self.dtype_factor = 1
        self.persistable = None
        self.shared_node_id = shared_node_id
        if self.dtype == paddle.float32 or self.dtype == paddle.int32:
            self.dtype_factor *= 4
        elif self.dtype == paddle.int64:
            self.dtype_factor *= 8
        elif self.dtype == paddle.uint8:
            self.dtype_factor = 1
        else:
            raise NotImplementedError("{} not counted".format(self.dtype))
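        # dtype_factor above is the element size in bytes; batch_size below is
        # only needed by get_size() to resolve -1 placeholder dimensions.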
        self.batch_size = None
        if batch_size is not None:
            self.batch_size = batch_size

    def get_size(self):
        p = 1
        for i in self.node.shape:
            if i == -1:  # deal with placeholder
                assert self.batch_size is not None, "Batch size not decided."
                i = self.batch_size
            p *= i
        return p


class CompOpCostNode(CostNode):
    def __init__(self, node, node_type, id=None, is_bwd=False, is_optim=False):
        super(CompOpCostNode, self).__init__(node, node_type, id)
        self.is_bwd = is_bwd
        self.is_optim = is_optim

    def init_comp_cost(self, cost_data):
        # TODO: improve fluid.CostModel for more specific cost_data
        op_id = self.node.desc.id()
        if op_id in cost_data.keys():
            self.cost = cost_data[op_id]
        else:
            self.cost = 0.0


class PipeEvent(object):
    def __init__(self, stage_id, event_name, duration, start_time=-1):
        self.stage_id = stage_id
        self.name = event_name
        self.duration = duration
        self.s_time = start_time
        self.e_time = -1


class CostModel(object):
    def __init__(self,
                 mode=CostModelMode.BENCHMARKING,
                 cluster=None,
                 batch_size=1,
                 microbatch_num=1,
                 opcall_overhead=0,
                 standalone_cost_data=None,
                 pipeline_config=None):
        self.mode = mode

        # parameters
        self.opcall_overhead = opcall_overhead
        self.batch_size = batch_size
        self.microbatch_num = microbatch_num

        self.nodes = {}  # name -> node
        self.origin_graph = {}  # original graph
        self.op_graph = {}  # op graph (no variable nodes)
        self.runtime_graph = {}  # runtime graph, for simulation

        self.cluster = cluster
        self.cost_data = standalone_cost_data

        self.pp2rank = pipeline_config
        if self.pp2rank is not None:
            self.rank2pp = {}
            for stage_idx, ranks in enumerate(self.pp2rank):
                for rank in ranks:
                    self.rank2pp[rank] = stage_idx
        else:
            self.rank2pp = None

        self.ring2rank = {}

        self.fwd_time = []
        self.bwd_time = []
        self.optim_time = []

    def _parse_sub_program(self, program, nodes, graph, cost_data, sub_idx):
        assert len(
            program.blocks) == 1, "Program more than 1 block not supported."
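        # Graph convention: graph[node_id] == [successor ids, predecessor ids],
        # indexed by the module-level SUCC / PRED constants. A placeholder
        # variable node is created first for "lod_tensor_blocking_queue_0",
        # which does not appear in block.vars.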
        block = program.blocks[0]

        var_id = "lod_tensor_blocking_queue_0"
        new_var = program.global_block().create_var(
            name=var_id,
            dtype=paddle.float32,
            type=core.VarDesc.VarType.LOD_TENSOR)
        nodes[var_id] = TensorCostNode(new_var, CostNodeType.VARIABLE,
                                       "lod_tensor_blocking_queue_0")

        for var in block.vars.values():
            var_id = var.name
            # if var.name == "create_py_reader_0" or var.name == "double_buffer_0":
            #     continue
            nodes[var_id] = TensorCostNode(var, CostNodeType.VARIABLE, var_id)
            graph[var_id] = [[], []]

        for op in block.ops:
            op_id = op.type + "_" + str(op.idx)
            if op.type.startswith('c_') or op.type.startswith(
                    'send') or op.type.startswith('recv'):
                is_bwd = False
                if op.type.startswith('c_') and \
                        op.type != "c_sync_calc_stream" and \
                        not op.type.startswith('c_embedding'):
                    ring_id = op.attr('ring_id')
                    if ring_id not in self.ring2rank:
                        self.ring2rank[ring_id] = set()
                    self.ring2rank[ring_id].add(sub_idx)
                    is_bwd = '@GRAD' in op.output('Out')[0]
                elif op.type.startswith('recv'):
                    is_bwd = '@GRAD' in op.output('Out')[0]
                elif op.type.startswith('send'):
                    is_bwd = '@GRAD' in op.input('X')[0]
                op_node = CommOpCostNode(
                    op, CostNodeType.COMMUNICATION, op_id, is_bwd=is_bwd)
            else:
                is_bwd = (int(op.attr('op_role')) == int(OpRole.Backward)
                          ) or "@GRAD" in op.input_arg_names
                is_optim = 'LearningRate' in op.input_names
                op_node = CompOpCostNode(op, CostNodeType.COMPUTATION, op_id,
                                         is_bwd, is_optim)
                op_node.init_comp_cost(cost_data)

            nodes[op_id] = op_node
            graph[op_id] = [[], []]

            comm_input_shape = [0]
            comm_output_shape = [0]
            for i in range(len(op.input_names)):
                try:
                    var_id = op.input(op.input_names[i])[0]
                    var_node = nodes[var_id]
                    graph[op_id][PRED].append(var_node.id)
                    graph[var_id][SUCC].append(op_node.id)
                    comm_input_shape = var_node.shape
                except:
                    continue

            for i in range(len(op.output_names)):
                try:
                    var_id = op.output(op.output_names[i])[0]
                    var_node = nodes[var_id]
                    graph[op_id][SUCC].append(var_node.id)
                    graph[var_id][PRED].append(op_node.id)
                    comm_output_shape = var_node.shape
                except:
                    continue
            if op_node.type == CostNodeType.COMMUNICATION:
                op_node.set_shapes(comm_input_shape, comm_output_shape)

        # resolve hazard: rename the r/w hazard variable nodes to ensure
        # self.origin_graph is a DAG
        new_var_dict = {}
        for node_id, node in nodes.items():
            if node.type == CostNodeType.VARIABLE and node.node.persistable:
                write_op_cnt = 0
                for pred_id in graph[node_id][PRED]:
                    pred = nodes[pred_id]
                    if pred.type == CostNodeType.COMPUTATION and (
                            pred_id in graph[node_id][SUCC]):

                        graph[pred_id][SUCC].remove(node_id)
                        graph[node_id][PRED].remove(pred_id)

                        write_op_cnt += 1
                        new_var_id = node_id + '_write_{}'.format(write_op_cnt)
                        new_var = TensorCostNode(
                            node.node,
                            CostNodeType.VARIABLE,
                            new_var_id,
                            shared_node_id=node_id)
                        graph[new_var_id] = [[], []]
                        graph[pred_id][SUCC].append(new_var_id)
                        graph[new_var_id][PRED].append(pred_id)
                        new_var_dict[new_var_id] = new_var
        for k, v in new_var_dict.items():
            nodes[k] = v
        return nodes

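    # parse_program() builds one nodes dict and one set of graphs per rank of
    # the distributed program; build_op_graph() then skips through variable
    # nodes so that op_graph connects operator nodes directly.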
    def parse_program(self, distributed_program):
        self.distributed_program = distributed_program
        self.total_rank = len(self.distributed_program)
        self.nodes = []
        self.origin_graph = []  # original graph
        self.op_graph = []  # op graph (no variable nodes)
        self.runtime_graph = []  # runtime graph, for simulation

        for sub_idx, sub_prog in enumerate(distributed_program):
            self.nodes.append({})
            self.origin_graph.append({})
            self.op_graph.append({})
            self.runtime_graph.append({})
            pp_idx = 0 if self.rank2pp is None else self.rank2pp[sub_idx]
            self._parse_sub_program(sub_prog, self.nodes[sub_idx],
                                    self.origin_graph[sub_idx],
                                    self.cost_data[pp_idx], sub_idx)
        return self.nodes

    def _find_succ_op(self, node_id, sub_idx=0):
        succ_ops_id = []
        for succ_id in self.origin_graph[sub_idx][node_id][SUCC]:
            succ = self.nodes[sub_idx][succ_id]
            if succ.type == CostNodeType.COMMUNICATION or \
                    succ.type == CostNodeType.COMPUTATION:
                succ_ops_id.append(succ_id)
            elif succ.type == CostNodeType.VARIABLE:
                succ_ops_id = succ_ops_id + self._find_succ_op(succ_id, sub_idx)
            else:
                raise NotImplementedError(
                    'This type of node is not supported yet: {}'.format(
                        succ.type))
        return succ_ops_id

    def build_op_graph(self):
        for sub_idx in range(self.total_rank):
            op_nodes_id = []
            for node_id, node in self.nodes[sub_idx].items():
                if node.type == CostNodeType.VARIABLE:
                    continue
                self.op_graph[sub_idx][node_id] = [[], []]
                op_nodes_id.append(node_id)

            for op_id in op_nodes_id:
                succ_nodes_id = self._find_succ_op(op_id, sub_idx)

                self.op_graph[sub_idx][op_id][SUCC] = succ_nodes_id
                for succ_id in succ_nodes_id:
                    self.op_graph[sub_idx][succ_id][PRED].append(op_id)

    def build_runtime_graph(self):
        self.runtime_graph = copy.deepcopy(self.op_graph)

    def eliminate_multi_edges(self, graph=None):
        for node_id, edges in graph.items():
            graph[node_id][PRED] = list(set(edges[PRED]))
            graph[node_id][SUCC] = list(set(edges[SUCC]))

    def merge_comm(self):
        for sub_idx in range(self.total_rank):
            for node_id, edges in self.op_graph[sub_idx].items():
                node = self.nodes[sub_idx][node_id]
                if node_id.startswith('c_') and \
                        not node.id.startswith("c_sync_calc_stream") and \
                        not node.id.startswith('c_embedding'):
                    ring_id = node.node.attr('ring_id')
                    node.set_ranks(list(self.ring2rank[ring_id]))
                    node.init_comm_cost(self.cluster)
                elif node_id.startswith('send') or node_id.startswith('recv'):
                    peer_rank = node.node.attr('peer')
                    node.set_ranks([sub_idx, peer_rank])
                    node.init_comm_cost(self.cluster)
                else:
                    pass  # not a communication op

    def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None):
        nodes_list = []
        node_cost = 0
        for node in to_merge_node_list:
            if isinstance(node, MergedOpsCostNode):
                nodes_list += node.node_list
            else:
                nodes_list.append(node.id)
            if merge_type == 'linear':
                node_cost += node.cost
            elif merge_type == 'branch':
                node_cost = max(node_cost, node.cost)
            else:
                raise NotImplementedError(
                    'This type of merging is not supported: {}'.format(
                        merge_type))
        merged_node_id = 'merged_' + str(len(nodes))
        is_bwd = to_merge_node_list[0].is_bwd
        merged_node = MergedOpsCostNode(
            CostNodeType.MERGED,
            id=merged_node_id,
            base_node_list=nodes_list,
            is_bwd=is_bwd)
        merged_node.cost = node_cost
        return merged_node_id, merged_node

    def merge_linear(self):
        '''
        This method does the following:
        If X depends on Y only, they must be run sequentially.
            [ e.g. A ->- C ->- D    D and E depend on C only. ]
            [      B ->-/ \->- E    C depends on A and B.     ]
        We merge X and Y into a new node and sum up their cost time.
        '''
        cnt = 0
        for sub_idx in range(self.total_rank):
            cnt += self._merge_linear(
                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False)
            cnt += self._merge_linear(
                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True)
        return cnt

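    # merge_linear() and merge_branch() are applied repeatedly by get_cost()
    # until neither of them can reduce the runtime graph any further.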
    def merge_branch(self):
        '''
        This method does the following:
        If a node has more than one successor, there is a *branch*.
            [ e.g. A ->- B ->- D                                      ]
            [       \->- C ->-/ , B and C can be run at the same time ]
        case 1: if B or C is null (or D is directly dependent on A),
                it's equivalent to A->C->D or A->B->D,
                fall back to self.merge_linear
        case 2: if both B and C are some op,
                merged_cost = max(cost(B), cost(C))
        '''
        cnt = 0
        for sub_idx in range(self.total_rank):
            cnt += self._merge_branch(
                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False)
            cnt += self._merge_branch(
                self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True)
        return cnt

    def _merge_linear(self, nodes, runtime_graph, is_bwd=False):
        reduct_cnt = 0
        rt_nodes_id = list(runtime_graph.keys())
        for node_id in rt_nodes_id:
            if node_id not in runtime_graph.keys():
                continue
            node = nodes[node_id]
            if not is_bwd == node.is_bwd or node.is_optim:
                continue
            edges = runtime_graph[node_id]
            ind = len(edges[PRED])  # in_degree
            if ind == 1:  # only depend on one node
                pred_id = edges[PRED][0]
                pred = nodes[pred_id]
                merged_node_id, merged_node = self._merge_node(
                    [node, pred], merge_type='linear', nodes=nodes)

                nodes[merged_node_id] = merged_node
                runtime_graph[merged_node_id] = [[], []]

                # delete edges and add new edges
                succ = None
                try:
                    runtime_graph[merged_node_id][SUCC] = copy.deepcopy(
                        edges[SUCC])
                    if len(runtime_graph[pred_id][SUCC]) > 1:
                        # predecessor has more than 1 successor;
                        # the merged_node is to inherit the rest of its successors
                        succ = runtime_graph[pred_id][SUCC]
                        succ.remove(node_id)
                        runtime_graph[merged_node_id][SUCC] += succ
                    runtime_graph[merged_node_id][PRED] = runtime_graph[
                        pred_id][PRED]
                except:
                    pass
                try:
                    for i in runtime_graph[pred_id][PRED]:
                        try:
                            runtime_graph[i][SUCC].remove(pred_id)
                        except:
                            continue
                        runtime_graph[i][SUCC].append(merged_node_id)
                except:
                    pass

                try:
                    for i in edges[SUCC]:
                        runtime_graph[i][PRED].remove(node_id)
                        runtime_graph[i][PRED].append(merged_node_id)
                except:
                    pass

                if succ is not None:
                    for i in succ:
                        try:
                            runtime_graph[i][PRED].remove(pred_id)
                        except:
                            continue
                        runtime_graph[i][PRED].append(merged_node_id)

                runtime_graph.pop(node_id)
                try:
                    runtime_graph.pop(pred_id)
                except:
                    continue
                reduct_cnt += 1
                self.eliminate_multi_edges(runtime_graph)
                break
        return reduct_cnt  # the number of nodes that have been reduced

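    # _merge_branch() collapses sibling branches that share the same start and
    # end nodes; the merged cost is the maximum of the branch costs, since the
    # branches are assumed to run concurrently.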
    def _merge_branch(self, nodes, runtime_graph, is_bwd=False):
        reduct_cnt = 0
        rt_nodes_id = list(runtime_graph.keys())
        for node_id in rt_nodes_id:
            node = nodes[node_id]
            if not is_bwd == node.is_bwd or node.is_optim:
                continue
            edges = runtime_graph[node_id]
            outd = len(edges[SUCC])  # out_degree
            if outd > 1:  # branch out
                succ_nodes_id = edges[SUCC]

                succ_to_elim = []
                for succ_id in succ_nodes_id:
                    for succ_2_id in succ_nodes_id:
                        try:
                            tmp = runtime_graph[succ_2_id][SUCC]
                        except:
                            continue
                        if succ_id in tmp:
                            succ_to_elim.append(succ_id)
                            break
                for id in succ_to_elim:
                    edges[SUCC].remove(id)
                    runtime_graph[id][PRED].remove(node_id)
                    reduct_cnt += 1

                to_merge = True
                try:
                    if len(edges[SUCC]) < 1 or len(runtime_graph[edges[SUCC][
                            0]][SUCC]) < 1:
                        continue
                except:
                    continue
                end_node_id = runtime_graph[edges[SUCC][0]][SUCC][0]
                for i in succ_nodes_id:
                    try:
                        if len(runtime_graph[i][SUCC]) != 1 or \
                                runtime_graph[i][SUCC][0] != end_node_id:
                            # if the branches have different end nodes,
                            # we don't merge them
                            to_merge = False
                            break
                    except:
                        continue
                if to_merge and len(succ_nodes_id) > 1:
                    to_merge_node_list = [nodes[i] for i in succ_nodes_id]
                    merged_node_id, merged_node = self._merge_node(
                        to_merge_node_list, merge_type='branch', nodes=nodes)
                    nodes[merged_node_id] = merged_node
                    runtime_graph[merged_node_id] = [[], []]

                    # delete edges and add new edges
                    runtime_graph[merged_node_id][SUCC] = [end_node_id]
                    runtime_graph[merged_node_id][PRED] = edges[PRED]

                    runtime_graph[end_node_id][PRED] = [merged_node_id]
                    runtime_graph[node_id][SUCC] = [merged_node_id]

                    try:
                        for i in succ_nodes_id:
                            runtime_graph.pop(i)
                        reduct_cnt += len(to_merge_node_list) - 1
                        break
                    except:
                        pass
        return reduct_cnt

    def get_runtime_cost(self):
        def get_node_cost(node):
            node_cost = node.cost + self.opcall_overhead
            if isinstance(node, MergedOpsCostNode):
                for it in node.node_list:
                    node_cost += self.opcall_overhead
            return node_cost

        for sub_idx in range(self.total_rank):
            fwd_cost = 0
            bwd_cost = 0
            optim_cost = 0
            for node_id in self.runtime_graph[sub_idx].keys():
                node = self.nodes[sub_idx][node_id]
                if node.is_optim:
                    optim_cost += get_node_cost(node)
                elif node.is_bwd:
                    bwd_cost += get_node_cost(node)
                else:
                    fwd_cost += get_node_cost(node)
            self.fwd_time.append(fwd_cost)
            self.bwd_time.append(bwd_cost)
            self.optim_time.append(optim_cost)
        return self.fwd_time, self.bwd_time, self.optim_time

    def get_mem(self):
        static_list = []
        top_list = []
        for sub_idx in range(self.total_rank):
            static_mem, cur_mem, top_mem = self._simulate_mem(
                self.nodes[sub_idx], self.origin_graph[sub_idx])
            static_list.append(static_mem)
            top_list.append(top_mem)
        return static_list, top_list

    def _simulate_mem(self, nodes, origin_graph):
        q = queue.Queue(1024)
        sim_graph = copy.deepcopy(origin_graph)
        for node_id, node in nodes.items():
            if len(sim_graph[node_id][PRED]) == 0:
                q.put(node_id)

        q.put('nop')
        cur_mem = 0
        top_mem = -1
        static_mem = 0
        while not q.empty():
            node_id = q.get()
            node = None
            size = 0
            if node_id == 'nop':
                top_mem = max(cur_mem, top_mem)
                if q.empty():
                    break
                else:
                    q.put(node_id)
                    continue
            else:
                node = nodes[node_id]
                if node.type == CostNodeType.VARIABLE:
                    size = node.get_size()
                    if node.node.persistable:
                        static_mem += size
                    cur_mem += size
                edges = sim_graph[node_id]
                if not (node.type == CostNodeType.VARIABLE and
                        node.node.persistable):
                    for succ_id in edges[SUCC]:
                        sim_graph[succ_id][PRED].remove(node_id)
                        if len(sim_graph[succ_id][PRED]) == 0:
                            q.put(succ_id)
                for pred_id in edges[PRED]:
                    pred = nodes[pred_id]
                    if pred.type == CostNodeType.VARIABLE:
                        sim_graph[pred_id][SUCC].remove(node_id)
                        if len(sim_graph[pred_id][
                                SUCC]) == 0 and not pred.node.persistable:
                            cur_mem -= pred.get_size()
        return static_mem, cur_mem, top_mem

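    # The pipeline simulation below replays a 1F1B-like schedule: every
    # forward event spawns the next stage's forward (or, on the last stage,
    # its own backward), and a stage's optimizer event fires once all of its
    # backward microbatches have finished.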
    def get_pipeline_time(self):
        if self.pp2rank is None:
            return self.fwd_time[0] + self.bwd_time[0] + self.optim_time[0]
        else:
            return self._simulate_pipeline()

    def _simulate_pipeline(self):
        stage_num = len(self.pp2rank)
        event_list = []
        global_time = [0] * stage_num
        total_time = 0
        fwd_cnt = list(range(stage_num, 0, -1))
        bwd_cnt = [self.microbatch_num] * stage_num
        q = queue.Queue(1024)

        for i in range(self.microbatch_num):
            q.put(PipeEvent(0, 'fwd', self.fwd_time[0]))

        while not q.empty():
            e = q.get()
            stid = e.stage_id
            if e.name == 'fwd':
                if fwd_cnt[stid] > 0:
                    e.s_time = max(global_time[stid], e.s_time)
                    e.e_time = e.s_time + e.duration
                    event_list.append(e)
                    if stid != stage_num - 1:
                        q.put(
                            PipeEvent(
                                stid + 1,
                                'fwd',
                                self.fwd_time[stid + 1],
                                start_time=e.e_time))
                    else:
                        q.put(
                            PipeEvent(
                                stid,
                                'bwd',
                                self.bwd_time[stid],
                                start_time=e.e_time))
                    fwd_cnt[stid] -= 1
                    global_time[stid] = e.e_time
                else:
                    q.put(e)
            elif e.name == 'bwd':
                e.s_time = max(global_time[stid], e.s_time)
                e.e_time = e.s_time + e.duration
                event_list.append(e)
                if stid != 0:
                    q.put(
                        PipeEvent(
                            stid - 1,
                            'bwd',
                            self.bwd_time[stid - 1],
                            start_time=e.e_time))
                fwd_cnt[stid] += 1
                bwd_cnt[stid] -= 1
                if bwd_cnt[stid] == 0:
                    q.put(
                        PipeEvent(
                            stid,
                            'optim',
                            self.optim_time[stid],
                            start_time=e.e_time))
                global_time[stid] = e.e_time
            elif e.name == 'optim':
                e.s_time = max(global_time[stid], e.s_time)
                e.e_time = e.s_time + e.duration
                event_list.append(e)
                global_time[stid] = e.e_time
            else:
                raise NotImplementedError(
                    'This type of pipe event is not supported yet: {}'.format(
                        e.name))

        for t in global_time:
            total_time = max(total_time, t)

        return total_time

    def get_cost(self):
        cost = Cost()
        static_mem, peak_mem = self.get_mem()
        cost.static_mem = static_mem
        cost.peak_mem = peak_mem
        self.merge_comm()
        while True:
            cnt = 0
            cnt += self.merge_linear()
            cnt += self.merge_branch()
            if cnt == 0:  # can't be further merged
                break
        self.get_runtime_cost()
        cost.runtime = self.get_pipeline_time()
        return cost

    def init(self, distributed_program):
        self.parse_program(distributed_program)
        self.build_op_graph()
        for sub_idx in range(self.total_rank):
            self.eliminate_multi_edges(self.op_graph[sub_idx])
        self.build_runtime_graph()


def estimate_cost(distributed_program, cluster, pipeline_config,
                  standalone_cost_data, batch_size):
    """
    Estimate the cost of a distributed program, given a cluster model and
    the distributed training settings.

    Args:
        distributed_program(list): list of paddle programs, one per rank
        cluster(Cluster): cluster model
        pipeline_config(list): configuration of pipeline stage allocation
        standalone_cost_data(CostData): cost data given by paddle.core
        batch_size(int): batch size of the training workload
    """
    # the following line is left for now; the cluster model will be involved
    # in the future
    assert cluster is None, "For now, cluster remains None"
    cm_ctx = CostModel(
        cluster=cluster,
        batch_size=batch_size,
        standalone_cost_data=standalone_cost_data,
        pipeline_config=pipeline_config)
    cm_ctx.init(distributed_program)
    cost = cm_ctx.get_cost()
    return cost
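

# Minimal illustrative sketch (not part of the original module's API): the
# pipeline simulation can be exercised on its own by constructing a CostModel
# with a pipeline_config and filling in per-stage times by hand. The two-stage
# times below are made-up numbers used purely for illustration.
if __name__ == "__main__":
    cm = CostModel(pipeline_config=[[0], [1]], microbatch_num=4)
    cm.fwd_time = [2.0, 2.0]  # hypothetical forward time per stage (ms)
    cm.bwd_time = [4.0, 4.0]  # hypothetical backward time per stage (ms)
    cm.optim_time = [1.0, 1.0]  # hypothetical optimizer time per stage (ms)
    print("simulated pipeline time:", cm._simulate_pipeline())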