# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY from paddle.fluid import core from paddle.static import Program class TaskNode: """ Python side TaskNode, connection to the c++ side TaskNode """ def __init__(self, rank, max_run_times, max_slot_times, role=None, node_type=None, task_id=0, ops=None, program=None, lazy_initialize=False): """ :param rank (int): Current rank of the task node. :param max_run_times (int): The max run times of the task node. :param max_slot_times (int): The mas slot times of the task node. :param role (int): The role of the task node. (Will be removed in the future) :param node_type (str): The type of the task node. :param task_id (int): The id of task node. :param ops (list): A list of op.desc to init the task node. (Will be removed in the future) :param program (Program): An instance of Program to init the task node. :param lazy_initialize (bool): In user-defined task, the program may change adding feed/fetch op. As efficient consideration, the task node will have the C++ object later. """ assert ((ops is not None) ^ (program is not None)), \ "Should provide only one of ops or program to task node." assert (not ((ops is not None) and lazy_initialize)), \ "Lazy initialization doesn't support with ops list" self.id = int(task_id) self.rank = rank self.max_run_times = max_run_times self.max_slot_times = max_slot_times self.node_type = node_type self.program = program self.lazy_initialize = lazy_initialize self.run_pre_steps = None self.run_at_offset = None self.node = None self.upstreams = [] self.downstreams = [] if not lazy_initialize: if ops is not None: assert role is not None and task_id is not None, \ "If init task node with ops, should provide `role` and `task_id`." self.node = core.TaskNode(role, ops, rank, task_id, max_run_times, max_slot_times) else: self.node = core.TaskNode(program.desc, rank, self.id, max_run_times, max_slot_times) if self.node_type: self.node.set_type(self.node_type) def task_node(self): if self.lazy_initialize: self.node = core.TaskNode(self.program.desc, self.rank, self.id, self.max_run_times, self.max_slot_times) if self.node_type: self.node.set_type(self.node_type) if self.run_pre_steps: self.node.set_run_pre_steps(self.run_pre_steps) if self.run_at_offset: self.node.set_run_at_offset(self.run_at_offset) for up in self.upstreams: self.node.add_upstream_task(up[0], up[1]) for down in self.downstreams: self.node.add_downstream_task(down[0], down[1]) self.lazy_initialize = False return self.node def set_program(self, program): assert self.lazy_initialize, \ "Inside program is unchangable for immediate initialized task node. Set the lazy_initialize to be true if the inside program need to be update. Remember to do all your change before eval node.task_node()." self.program = program def get_program(self): assert self.program is not None, "The task node is not initialized using program" return self.program def set_run_pre_steps(self, steps): if self.lazy_initialize: self.run_pre_steps = steps else: self.node.set_run_pre_steps(steps) def set_run_at_offset(self, offset): if self.lazy_initialize: self.run_at_offset = offset else: self.node.set_run_at_offset(offset) def add_upstream_task(self, upstream, buffer_size=2): if self.lazy_initialize: self.upstreams.append((upstream, buffer_size)) else: self.node.add_upstream_task(upstream, buffer_size) def add_downstream_task(self, downstream, buffer_size=2): if self.lazy_initialize: self.downstreams.append((downstream, buffer_size)) else: self.node.add_downstream_task(downstream, buffer_size) def task_id(self): return self.id class CoordSys: """ This class is used to mapping rank to (mp rank, sharding rank, pp rank, dp rank). """ def __init__(self, dist_opt): self.dp_degree = dist_opt.get('dp_degree', 1) self.pp_degree = dist_opt.get('pp_degree', 1) self.sharding_degree = dist_opt.get('sharding_degree', 1) self.mp_degree = dist_opt.get('mp_degree', 1) def _invalide_coord(self, coord): """ Test the input coord is valid or not. :param coord: The coord to be tested :return: False if valid, True if invalid. """ return coord['mp_idx'] < 0 or coord['mp_idx'] >= self.mp_degree or \ coord['sharding_idx'] < 0 or coord['sharding_idx'] >= self.sharding_degree or \ coord['pp_idx'] < 0 or coord['pp_idx'] >= self.pp_degree or \ coord['dp_idx'] < 0 or coord['dp_idx'] >= self.dp_degree def coord_to_rank(self, coord): """ Map the input coord to it's corresponding rank. :param coord: The coord to be converted :return: The rank corresponding with the coord """ if self._invalide_coord(coord): return -1 return int(coord['dp_idx'] * self.pp_degree * self.sharding_degree * self.mp_degree + \ coord['pp_idx'] * self.sharding_degree * self.mp_degree + \ coord['sharding_idx'] * self.mp_degree + coord['mp_idx']) def rank_to_coord(self, rank): """ Map the input rank to it's corresponding coord :param rank: The rank to be converted :return: The coord corresponding with the rank """ mp_idx = rank % self.mp_degree rank //= self.mp_degree sharding_idx = rank % self.sharding_degree rank //= self.sharding_degree pp_idx = rank % self.pp_degree rank //= self.pp_degree dp_idx = rank % self.dp_degree return { 'mp_idx': int(mp_idx), 'sharding_idx': int(sharding_idx), 'pp_idx': int(pp_idx), 'dp_idx': int(dp_idx) } class FleetExecutorUtils: def __init__(self, dist_strategy=None, rank=None, nrank=None, max_run_times=None): self.dist_strategy = dist_strategy self.rank = rank self.nrank = nrank self.max_run_times = max_run_times self.is_auto_parallel = True if dist_strategy is None else False self.num_of_functionality = 4 self.coord_sys = None self.coord = None if dist_strategy: self.coord_sys = CoordSys(dist_strategy) self.coord = self.coord_sys.rank_to_coord(rank) def is_optimizer_op(self, op_role): return op_role == int(OpRole.Optimize) def is_lr_sched_op(self, op_role): return op_role == int(OpRole.Optimize.LRSched) def is_forward_op(self, op_role): return (op_role == int(OpRole.Forward)) or \ (op_role == (int(OpRole.Forward) | int(OpRole.Loss))) def is_backward_op(self, op_role): return (op_role == int(OpRole.Backward)) or \ (op_role == (int(OpRole.Backward) | int(OpRole.Loss))) def split_program_to_op_list(self, program): op_list_map = {"lr": [], "fwd": [], "bwd": [], "opt": []} for op in program.block(0).ops: # split the program based on the op_role op_role = int(op.all_attrs()[OP_ROLE_KEY]) if self.is_lr_sched_op(op_role): op_list_map["lr"].append(op) elif self.is_forward_op(op_role): op_list_map["fwd"].append(op) elif self.is_backward_op(op_role): op_list_map["bwd"].append(op) elif self.is_optimizer_op(op_role): op_list_map["opt"].append(op) else: raise "The op role: " + str( op_role ) + " isn't one of LRSched, Forward, Backward or Optimizer." return op_list_map def convert_op_list_to_program(self, op_list, complete_program): #TODO(liyurui): Complete this convert logic program_map = { "lr": Program(), "fwd": Program(), "bwd": Program(), "opt": Program() } return program_map def build_1f1b_dependency(self, task_node_map): assert not self.is_auto_parallel, "Handly add dependency should not be invoked in auto parallel mode" # Generated the dependency based on this graph: # lr(1:m) -> forward -> backward -> (m:1)optimize # ↑ ↓ # lr(1:m) -> forward -> backward -> (m:1)optimize # ↑ ↓ # lr(1:m) -> forward -> backward -> (m:1)optimize # add dependency intra stage cur_start_id = self.rank * self.num_of_functionality pp_buff_size = int(self.dist_strategy['pp_degree'] - self.coord['pp_idx']) task_node_map["lr"].add_downstream_task(cur_start_id + 1) task_node_map["fwd"].add_upstream_task(cur_start_id) task_node_map["fwd"].add_downstream_task(cur_start_id + 2, pp_buff_size) task_node_map["bwd"].add_upstream_task(cur_start_id + 1, pp_buff_size) task_node_map["bwd"].add_downstream_task(cur_start_id + 3) task_node_map["opt"].add_upstream_task(cur_start_id + 2) # add dependency inter stage upstream_coord, downstream_coord = self.coord.copy(), self.coord.copy() upstream_coord['pp_idx'] = upstream_coord['pp_idx'] - 1 downstream_coord['pp_idx'] = downstream_coord['pp_idx'] + 1 pp_upstream = self.coord_sys.coord_to_rank(upstream_coord) pp_downstream = self.coord_sys.coord_to_rank(downstream_coord) first_stage = (pp_upstream == -1) last_stage = (pp_downstream == -1) prev_pp_start_id = pp_upstream * self.num_of_functionality next_pp_start_id = pp_downstream * self.num_of_functionality if not first_stage: task_node_map["fwd"].add_upstream_task(prev_pp_start_id + 1) task_node_map["bwd"].add_downstream_task(prev_pp_start_id + 2) if not last_stage: task_node_map["fwd"].add_downstream_task(next_pp_start_id + 1) task_node_map["bwd"].add_upstream_task(next_pp_start_id + 2) return task_node_map def construct_task_nodes_1f1b(self, program_map): max_slot_times = int(self.max_run_times - self.coord['pp_idx']) cur_start_id = int(self.rank * self.num_of_functionality) lr_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, program=program_map["lr"], task_id=cur_start_id) fwd_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, program=program_map["fwd"], task_id=cur_start_id + 1) bwd_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, program=program_map["bwd"], task_id=cur_start_id + 2) opt_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, program=program_map["opt"], task_id=cur_start_id + 3) return { "lr": lr_task_node, "fwd": fwd_task_node, "bwd": bwd_task_node, "opt": opt_task_node } def task_id_to_rank(self): task_id_to_rank = {} for i in range(self.nrank): for j in range(self.num_of_functionality): task_id_to_rank[int(i * self.num_of_functionality + j)] = i return task_id_to_rank def construct_task_nodes_1f1b_op_list(self, op_list_map): max_slot_times = int(self.max_run_times - self.coord['pp_idx']) cur_start_id = int(self.rank * self.num_of_functionality) lr_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, role=int(OpRole.Optimize.LRSched), ops=op_list_map["lr"], task_id=cur_start_id, node_type="Amplifier") lr_task_node.set_run_pre_steps(self.max_run_times) fwd_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, role=int(OpRole.Forward), ops=op_list_map["fwd"], task_id=cur_start_id + 1, node_type="Compute") bwd_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, role=int(OpRole.Backward), ops=op_list_map["bwd"], task_id=cur_start_id + 2, node_type="Compute") opt_task_node = TaskNode(rank=self.rank, max_run_times=self.max_run_times, max_slot_times=max_slot_times, role=int(OpRole.Optimize), ops=op_list_map["opt"], task_id=cur_start_id + 3, node_type="Amplifier") opt_task_node.set_run_pre_steps(self.max_run_times) opt_task_node.set_run_at_offset(self.max_run_times - 1) return { "lr": lr_task_node, "fwd": fwd_task_node, "bwd": bwd_task_node, "opt": opt_task_node } def run1f1b(program, rank, max_run_times, dist_opt, nrank, with_standalone_executor=False): """ Split the program to support 1f1b pipeline scheduler. This funct will split the program based on the op_role. The program will be split into four parts: lr_sched, fwd, bwd, opt. And will create task nodes based on the four parts of the program. :param program: The origin program. :param rank: Current rank (can be got from fleet.worker_index()). :param max_run_times: Max run times for a micro batch. AKA number of micro steps. :param dist_opt: The fleet_opt configured by user. :param nrank: Number of workers (can be got from fleet.worker_num()). :param with_standalone_executor: Experiment feature, use fleet executor with standalone executor. :return: task_nodes (list): four task nodes for current rank task_id_to_rank (dict): task nodes' ids to it's corresponding rank """ print("fleet executor will use python side 1f1b scheduler.") fleet_executor_utils = FleetExecutorUtils(dist_strategy=dist_opt, rank=rank, nrank=nrank, max_run_times=max_run_times) op_list_map = fleet_executor_utils.split_program_to_op_list(program) task_node_map = None if with_standalone_executor: program_map = fleet_executor_utils.convert_op_list_to_program( op_list_map, program) task_node_map = fleet_executor_utils.construct_task_nodes_1f1b( program_map) else: op_desc_list_map = {"lr": [], "fwd": [], "bwd": [], "opt": []} for key in op_list_map: for op in op_list_map[key]: op_desc_list_map[key].append(op.desc) task_node_map = fleet_executor_utils.construct_task_nodes_1f1b_op_list( op_desc_list_map) task_node_map = fleet_executor_utils.build_1f1b_dependency(task_node_map) task_id_to_rank = fleet_executor_utils.task_id_to_rank() task_node_list = [task_node_map[key].task_node() for key in task_node_map] return task_node_list, task_id_to_rank def origin(program, rank): """ Origin scheduler for fleet executor, supports non-pp mode :param program: The origin program. :param rank: Current rank (can be got from fleet.worker_index()). :return: task_nodes (list): four task nodes for current rank task_id_to_rank (dict): a fake dict, since there is no upstream or downstream, this dict won't be used """ print("fleet executor will use python side origin scheduler.") task_node = TaskNode(program=program, rank=rank, node_type="Compute", max_run_times=1, max_slot_times=1) task_id_to_rank = {task_node.task_id(): rank} return [task_node.task_node()], task_id_to_rank