From d83d59dde30128870ff1a720d04d670511cdfee2 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 14 Jun 2022 16:15:35 +0800 Subject: [PATCH] [cuda graph] partial program with cuda graph under static mode (#43440) --- paddle/fluid/framework/program_desc.cc | 21 +- paddle/fluid/operators/run_program_op.h | 45 ++- .../platform/cuda_graph_with_memory_pool.cc | 10 + python/paddle/device/cuda/graphs.py | 335 ++++++++++++++++++ ...est_cuda_graph_partial_graph_static_run.py | 127 +++++++ 5 files changed, 525 insertions(+), 13 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 88738255af7..f4af3c5eba0 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -59,9 +59,12 @@ ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; + std::vector old_block_desc; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + // record all block desc's ptr from origin program + old_block_desc.emplace_back(o.blocks_[i].get()); } for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { auto all_ops = blocks_[block_id]->AllOps(); @@ -70,9 +73,21 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { - int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); - op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + framework::BlockDesc *block_desc = + BOOST_GET_CONST(framework::BlockDesc *, op->GetAttr(attr_name)); + if (std::find(old_block_desc.begin(), old_block_desc.end(), + block_desc) != old_block_desc.end()) { + // The block is owned by the origin program. Just use id to get + // the corresponding block. + int sub_block_id = + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); + op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else { + // The block is not owned by the origin program. Should copy + // the real block desc instead of logical block in the program. + VLOG(3) << "Set op's block attr with the original block"; + op->SetBlockAttr(attr_name, block_desc); + } } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { std::vector sub_block_ids = o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index bfd33efe833..42e8379bca4 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -257,7 +257,12 @@ class RunProgramOpKernel : public framework::OpKernel { auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); - auto dout_var_names = ctx.OutputNames("DOut"); + std::vector dout_var_names; + if (!dout_vars.empty()) { + // DOut is a dispensable out, only get the names when it exists. + // Otherwise, it will throw a NotFound error. + dout_var_names = ctx.OutputNames("DOut"); + } // current program may not hold parameters std::vector param_names; @@ -272,10 +277,23 @@ class RunProgramOpKernel : public framework::OpKernel { // NOTE(chenweihang): In order not to add new variable type, use vector // here. Originally, here can use scope directly. 
auto *out_scope_vec = ctx.Output("OutScope"); - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), 1, - platform::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); + std::unique_ptr inner_scope{nullptr}; + if (out_scope_vec->size() == 0) { + // For cuda graph under static mode usage. + // For static mode, we cannot set value of a tensor before any run, + // the OutScope variable passed to the op actually contains nothing. + // Just create a tmp scope to run the program. + PADDLE_ENFORCE_EQ( + use_cuda_graph, true, + platform::errors::InvalidArgument( + "If not provide OutScope then must run under cuda graph mode.")); + inner_scope = std::make_unique(); + } else { + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + } // Step 2. prepare executor and init persistable variables @@ -284,9 +302,10 @@ class RunProgramOpKernel : public framework::OpKernel { // Learning. Tensor data in multi-step training should be saved into single // scope separately. Otherwise, the gradients can be miscalculated because // always using the Tensor data of the last step in forward. - framework::Scope *global_inner_scope = out_scope_vec->front(); + framework::Scope *global_inner_scope = + out_scope_vec->size() == 0 ? inner_scope.get() : out_scope_vec->front(); VLOG(2) << "The number of sub scopes before forward: " - << out_scope_vec->front()->kids().size(); + << global_inner_scope->kids().size(); framework::Scope &scope = global_inner_scope->NewScope(); // share input_vars & parameters into scope @@ -341,13 +360,19 @@ class RunProgramOpKernel : public framework::OpKernel { &scope); // Debug info: scope info when run end - VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + framework::Scope *target_scope{nullptr}; + if (out_scope_vec->size() == 0) { + target_scope = inner_scope.get(); + } else { + target_scope = out_scope_vec->front(); + } + VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope); // Step 5. Drop all children scopes while testing. if (is_test) { - out_scope_vec->front()->DropKids(); + target_scope->DropKids(); } VLOG(2) << "The number of sub scopes after forward: " - << out_scope_vec->front()->kids().size(); + << target_scope->kids().size(); #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 4ef2a9709a5..9d3d342431b 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -28,6 +28,16 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + // After PR(#43206), cudnn related initializations will change to lazy mode. + // It will only be initialized when op calls them. But cuda graph not support + // capture such kind of init, need to init all these handle before cuda graph. 
+ dev_ctx->cublas_handle(); +#if CUDA_VERSION >= 11060 + dev_ctx->cublaslt_handle(); +#endif + dev_ctx->cudnn_handle(); + dev_ctx->cusolver_dn_handle(); + auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index dca32fb6bb8..5c9c8740d85 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -14,7 +14,10 @@ import os import paddle +from paddle.fluid import core +from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace +import warnings if is_compiled_with_cuda() and not is_compiled_with_rocm(): from paddle.fluid.core import CUDAGraph as CoreCUDAGraph @@ -106,3 +109,335 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): else: mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id return new_function + + +def copy_var_desc(dst, src): + """ + copy var desc from src to dst + + :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance + :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance + :return: no return + """ + dst.set_shape(src.shape) + dst.set_dtype(src.dtype) + dst.set_lod_level(src.lod_level) + dst.set_type(src.type) + dst.set_persistable(src.persistable) + dst.set_is_parameter(src.is_parameter) + dst.set_stop_gradient(src.stop_gradient) + + +def all_inputs_of_later_op(block, begin_idx): + """ + find all inputs of ops after an idx, used to determine the logical output of a cuda graph section + + :param block: framework.Block, the original block + :param begin_idx: int, from which idx (not include) to find the later ins + :return: a list of inputs names for all ops behind begin_idx + """ + ins = [] + for idx, op in enumerate(block.ops): + if idx <= begin_idx: + continue + for in_name in op.input_arg_names: + ins.append(in_name) + return list(set(ins)) + + +def construct_program_and_find_ins_outs(section, origin_program, section_idx): + """ + 1. Construct a new program for corresponding section + 2. 
Find all the logical inputs and outputs of a program section + + :param section: list, one cuda graph section, list of ops + :param origin_program: framework.Program, origin program + :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx + :return: a new program for the cuda graph section + the logical ins and outs of the cuda graph section + """ + program = paddle.static.Program() + block = program.global_block() + origin_block = origin_program.global_block() + ins = [] + outs = [] + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + later_ins = all_inputs_of_later_op(origin_block, section_idx[-1]) + + for op in section: + for in_name in op.input_arg_names: + var = origin_block.var(in_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + if outs.count(in_name) == 0 and ins.count(in_name) == 0: + # This in var is generated from op outside this section + # Only record once for same input + ins.append(in_name) + elif later_ins.count(in_name) == 0: + # this is var is generated from op inside this section, and only will be used inside this section + outs.remove(in_name) + for out_name in op.output_arg_names: + var = origin_block.var(out_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + # for every output, we add it to the section's outs + if outs.count(out_name) == 0: + # Only record one out var even if it will be generated by multi ops. + # For scenario like this: + # A = op1(a) + # A = op2(b) + # B = op3(A) + outs.append(out_name) + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name)) + + program._sync_with_cpp() + + return program, [ins, outs] + + +def get_cuda_graph_sections(program): + """ + get all sections that should run under cuda graph and the corresponding idx + + :param program: framework.Program, the original program + :return: A list of cuda graph sections and the corresponding ops' idx in the block. + The program is under is test or not. + """ + block = program.global_block() + cuda_graph_sections = [] # record all ops in every cuda graph sections + sections_idx = [] # idx of all ops in every cuda graph sections + is_test = False # will be set to True is any op's 'is_test' attr is True + + # ops and it's idx between cuda graph wrapped op, may belong to a section + internal_section = [] + internal_idx = [] + + current_section = [] # current recording cuda graph sections + current_idx = [] # current recording cuda graph ops' idx + current_cuda_graph_id = -1 # current recording cuda graph id + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss) + backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward) + loss_grad_op_role = loss_op_role | backward_op_role + + for idx, op in enumerate(block.ops): + if op.type == 'conditional_block' or op.type == 'while': + assert op._cuda_graph_attr is None, "Cuda graph not support conditional block op and while op." 
+ if op.has_attr('is_test') and op.attr('is_test'): + is_test = True + # find cuda graph sections + if op._cuda_graph_attr is not None: + assert isinstance(op._cuda_graph_attr, + str), "cuda_graph_attr should be a str" + cuda_graph_attrs = op._cuda_graph_attr.split(';') + assert len(cuda_graph_attrs) == 3, "cuda graph attr should have three fields: " \ + "cuda graph mode, cuda graph memory pool id, cuda graph id" + local_cuda_graph_id = int(cuda_graph_attrs[2]) + if local_cuda_graph_id == current_cuda_graph_id: + if len(internal_section) > 0: + assert len(internal_section) == len( + internal_idx + ), "len of internal section should be equal with len of internal idx" + for internal_op in internal_section: + loss_related = (int(internal_op.attr(op_role_attr_name)) + == loss_op_role) or int( + (internal_op.attr(op_role_attr_name) + ) == loss_grad_op_role) + sub_block_related = (op.type == 'conditional_block' + or op.type == 'while') + if loss_related or sub_block_related: + # if loss_related is True + # The internal section contains loss related ops, + # although these ops are between two cuda graph sections with same graph id, + # they belong to none of these two sections. + # The loss related op should be wrapped by user explicitly. + + # if sub_block_related is True + # The internal section contains while op or conditional block op. + # These two ops are not supported by cuda graph. Won't extend the section. + internal_section = [] + internal_idx = [] + # Beside clear the internal section, a new cuda graph section should be recorded + assert len(current_section) == len(current_idx), \ + "num of section's op is not equal with the idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [] + current_idx = [] + break + # some ops inserted by some optimizer, should be added to current section + for i in range(len(internal_section)): + current_section.append(internal_section[i]) + current_idx.append(internal_idx[i]) + internal_section = [] + current_section.append(op) + current_idx.append(idx) + else: + # current graph id is different with previous, start a new section of cuda graph + # internal ops and idx belong to no section, just clear it + internal_section = [] + internal_idx = [] + current_cuda_graph_id = local_cuda_graph_id # start record a new section + assert len(current_section) == len( + current_idx + ), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [op] + current_idx = [idx] + else: + # recode ops which cuda_graph_attr is None, may belong to a section + internal_section.append(op) + internal_idx.append(idx) + + # handle the last section + assert len(current_section) == len( + current_idx), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + + return cuda_graph_sections, sections_idx, is_test + + +def replace_cuda_graph_section(ins_and_outs, section_program, section_idx, + origin_program, cuda_graph_section, order, + is_test): + """ + Use section_program and ins_and_outs to initialize a run_program_op, + and replace the section_idx marks ops in the origin program. 
+ + :param ins_and_outs: list, the logical ins and outs of the section program + :param section_program: framework.Program, the partial program need to run under cuda graph + :param section_idx: list, the idx need to be removed from origin program + :param origin_program: framework.Program, the origin program + :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test + :param order: int, the order of current section, used to create unique cuda graph var + :param is_test: bool, the program is running under is_test or not + :return: no return + """ + ins = ins_and_outs[0] + outs = ins_and_outs[1] + insert_idx = section_idx[0] + origin_block = origin_program.global_block() + + for idx in reversed(section_idx): + # remove all cuda graph marked ops from origin block + origin_block._remove_op(idx, sync=False) + + mode = None + memory_pool_id = None + + for op in cuda_graph_section: + # find the cuda graph mode and memory pool id, determine is test or not + if op._cuda_graph_attr is not None: + attrs = op._cuda_graph_attr.split(';') + mode = attrs[0] + memory_pool_id = int(attrs[1]) + break + + assert mode is not None and memory_pool_id is not None, \ + "mode and memory pool id should be specified in cuda graph attr" + + cuda_graph_var = origin_block.create_var( + name="cuda_graph_" + str(order), + type=core.VarDesc.VarType.RAW, + persistable=True, + stop_gradient=True, + ) + + # not used for the run_program_op, just needed by the op, but won't be used + out_scope_var = origin_block.create_var( + name="program_out_scope_" + str(order), + type=core.VarDesc.VarType.STEP_SCOPES, + persistable=True, + stop_gradient=True, + ) + + program_id = _hash_with_id(section_program, ins_and_outs) + + # insert the run_program_op into the block + origin_block._insert_op(insert_idx, + type='run_program', + inputs={'X': ins}, + outputs={ + 'Out': outs, + 'OutScope': out_scope_var, + 'CUDAGraph': cuda_graph_var + }, + attrs={ + 'global_block': + section_program.global_block(), + 'start_op_index': + 0, + 'end_op_index': + len(section_program.global_block().ops), + 'is_test': + is_test, + 'program_id': + program_id, + 'cuda_graph_capture_mode': + mode, + 'cuda_graph_pool_id': + memory_pool_id, + }) + + +def cuda_graph_transform(program): + """ + replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph + + :param program: framework.Program, the program to be transformed + :return: the cuda graph section program, user should hold these programs! + """ + + if len(program.blocks) > 1: + # some sub blocks may be inserted by optimizer but will not use during training, just warn here + warnings.warn( + "Sub block(s) has been detected in the program. " + "Cuda graph not support op with sub block, and it will only handle the global block." + ) + + # step 1: get all cuda graph sections. + # A cuda graph section contains all ops marked with same cuda graph id and + # some ops inserted by some optimizers (amp, sharding for example) between ops with same id. + cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections( + program) + assert len(cuda_graph_sections) == len(sections_idx), \ + "num of cuda graph sections is not equal with num of idx sections" + + # step 2: construct new program for each section and find inputs and outputs of each section. + # The inputs are variables generated outside the section but will be used by this section. + # The outputs are variables generated by this section and will be used after the end of the section. 
+ ins_and_outs = [] + section_programs = [] + for i in range(len(cuda_graph_sections)): + # creating new program for current section + section_program, ins_outs = construct_program_and_find_ins_outs( + cuda_graph_sections[i], program, sections_idx[i]) + ins_and_outs.append(ins_outs) + section_programs.append(section_program) + assert len(section_programs) == len(cuda_graph_sections), \ + "the num of cuda graph sections should be equal with the num of new program" + + # step 3: replace the ops in original program with run_program_op. + # Will remove all ops in the section from origin program, and use run_program_op to replace them. + for i in reversed(range(len(cuda_graph_sections))): + # carry out the replacement in reversed order, to keep the previous idx intact + replace_cuda_graph_section(ins_and_outs[i], + section_programs[i], + sections_idx[i], + program, + cuda_graph_sections[i], + order=i, + is_test=is_test) + + # NOTE: user should hold these program, for now just return these program back to caller + return section_programs diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py new file mode 100644 index 00000000000..445211d35a1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import unittest +import numpy as np +from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported, cuda_graph_transform + +paddle.enable_static() + + +class SimpleModel(nn.Layer): + + def __init__(self, in_size, out_size): + super(SimpleModel, self).__init__() + self.linear = nn.Linear(in_size, out_size) + self.dropout_1 = paddle.nn.Dropout(0.1) + self.relu = nn.ReLU() + self.dropout_2 = paddle.nn.Dropout(0.5) + self.gelu = nn.GELU() + + def forward(self, x): + x = self.linear(x) + x = self.dropout_1(x) + x = self.relu(x) + x = self.dropout_2(x) + x = self.gelu(x) + return x + + +class TestCudaGraphAttrAll(unittest.TestCase): + + def setUp(self): + paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0}) + + def get_model(self, use_cuda_graph=False): + x = paddle.static.data(shape=[3, 10], dtype='float32', name='x') + + model_start = SimpleModel(10, 20) + if use_cuda_graph: + model_start = wrap_cuda_graph(model_start) + + model_inter = SimpleModel(20, 20) + + model_end = SimpleModel(20, 10) + if use_cuda_graph: + model_end = wrap_cuda_graph(model_end, memory_pool='new') + + start_out = model_start(x) + inter_out = model_inter(start_out) + end_out = model_end(inter_out) + loss = paddle.mean(end_out) + + opt = paddle.optimizer.SGD() + opt.minimize(loss) + + return loss + + def run_with_cuda_graph(self, x_data): + # run with cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model(use_cuda_graph=True) + + section_programs = cuda_graph_transform(main_prog) + assert len(section_programs) == 4 + + block = main_prog.global_block() + run_program_op_num = 0 + for op in block.ops: + if op.type == 'run_program': + run_program_op_num += 1 + assert run_program_op_num == 4 + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def normal_run(self, x_data): + # run without cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model() + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def test_static_mode_cuda_graph(self): + if not is_cuda_graph_supported(): + return + x_data = np.random.random((3, 10)).astype('float32') + cuda_graph_rst = self.run_with_cuda_graph(x_data) + normal_run_rst = self.normal_run(x_data) + assert np.array_equal(cuda_graph_rst, normal_run_rst) + + +if __name__ == "__main__": + unittest.main() -- GitLab
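
A minimal usage sketch of the static-mode flow this patch introduces (wrap_cuda_graph plus the new cuda_graph_transform), condensed from the unit test added above. The Linear layer, the [3, 10] input shape, and the bare SGD optimizer are illustrative placeholders rather than requirements of the API, and a CUDA build/device where graph capture is supported is assumed (see is_cuda_graph_supported()):

    import numpy as np
    import paddle
    from paddle.device.cuda.graphs import wrap_cuda_graph, cuda_graph_transform

    paddle.enable_static()

    main_prog = paddle.static.Program()
    start_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, start_prog):
        x = paddle.static.data(shape=[3, 10], dtype='float32', name='x')
        # Under static mode, wrap_cuda_graph marks the ops generated by the
        # wrapped layer with a _cuda_graph_attr of the form
        # "mode;memory_pool_id;cuda_graph_id" so that cuda_graph_transform can
        # find them later. nn.Linear here is just a stand-in layer.
        layer = wrap_cuda_graph(paddle.nn.Linear(10, 10))
        loss = paddle.mean(layer(x))
        paddle.optimizer.SGD().minimize(loss)

    # Rewrite every marked section into a run_program op. As noted in
    # cuda_graph_transform, the caller must keep the returned section programs
    # alive for as long as main_prog is used.
    section_programs = cuda_graph_transform(main_prog)

    exe = paddle.static.Executor(paddle.CUDAPlace(0))
    exe.run(start_prog)
    out = exe.run(main_prog,
                  feed={'x': np.random.random((3, 10)).astype('float32')},
                  fetch_list=[loss])

After the transform, each marked section (forward and backward are separate sections) is executed by a run_program op that runs its section program under CUDA graph capture and replay, with cuda_graph_capture_mode and cuda_graph_pool_id taken from the original ops' _cuda_graph_attr.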