PaddlePaddle / Paddle

Commit d83d59dd (unverified)
Authored on Jun 14, 2022 by Yuang Liu, committed via GitHub on Jun 14, 2022
Parent commit: db58dd27

[cuda graph] partial program with cuda graph under static mode (#43440)
Showing 5 changed files with 525 additions and 13 deletions (+525 −13)
paddle/fluid/framework/program_desc.cc                                             +18  −3
paddle/fluid/operators/run_program_op.h                                            +35  −10
paddle/fluid/platform/cuda_graph_with_memory_pool.cc                               +10  −0
python/paddle/device/cuda/graphs.py                                                +335 −0
python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py    +127 −0
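At a high level, this change lets sections of a static-mode program be captured and replayed as CUDA graphs: ops built under `wrap_cuda_graph` are marked with a cuda graph attribute, and `cuda_graph_transform` rewrites each marked section of the main program into a `run_program` op. The following is a minimal sketch of the intended flow, modeled on the unit test added at the end of this commit; the layer, sizes, and variable names are illustrative, not part of the change itself.

import numpy as np
import paddle
from paddle.device.cuda.graphs import wrap_cuda_graph, cuda_graph_transform

paddle.enable_static()

main_prog, start_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
    x = paddle.static.data(shape=[3, 10], dtype='float32', name='x')
    # ops built by the wrapped layer are marked with a cuda graph attribute
    net = wrap_cuda_graph(paddle.nn.Linear(10, 10))
    loss = paddle.mean(net(x))
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

# rewrite every marked section into a run_program op; the returned section
# programs must be kept alive for as long as main_prog is used
section_programs = cuda_graph_transform(main_prog)

exe = paddle.static.Executor(paddle.CUDAPlace(0))
exe.run(start_prog)
for _ in range(10):
    out = exe.run(main_prog,
                  feed={'x': np.random.random((3, 10)).astype('float32')},
                  fetch_list=[loss])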
paddle/fluid/framework/program_desc.cc

@@ -59,9 +59,12 @@ ProgramDesc::ProgramDesc() {
ProgramDesc::ProgramDesc(const ProgramDesc &o) {
  desc_ = o.desc_;
  std::vector<framework::BlockDesc *> old_block_desc;
  for (int i = 0; i < desc_.blocks_size(); ++i) {
    auto *block = desc_.mutable_blocks(i);
    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
    // record all block desc's ptr from origin program
    old_block_desc.emplace_back(o.blocks_[i].get());
  }
  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
    auto all_ops = blocks_[block_id]->AllOps();
@@ -70,9 +73,21 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
      for (const std::string &attr_name : op->AttrNames()) {
        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
          framework::BlockDesc *block_desc =
              BOOST_GET_CONST(framework::BlockDesc *, op->GetAttr(attr_name));
          if (std::find(old_block_desc.begin(), old_block_desc.end(),
                        block_desc) != old_block_desc.end()) {
            // The block is owned by the origin program. Just use id to get
            // the corresponding block.
            int sub_block_id =
                o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
            op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
          } else {
            // The block is not owned by the origin program. Should copy
            // the real block desc instead of logical block in the program.
            VLOG(3) << "Set op's block attr with the original block";
            op->SetBlockAttr(attr_name, block_desc);
          }
        } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
          std::vector<int> sub_block_ids =
              o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
paddle/fluid/operators/run_program_op.h

@@ -257,7 +257,12 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
    auto input_var_names = ctx.InputNames("X");
    auto output_var_names = ctx.OutputNames("Out");
-   auto dout_var_names = ctx.OutputNames("DOut");
+   std::vector<std::string> dout_var_names;
+   if (!dout_vars.empty()) {
+     // DOut is a dispensable out, only get the names when it exists.
+     // Otherwise, it will throw a NotFound error.
+     dout_var_names = ctx.OutputNames("DOut");
+   }

    // current program may not hold parameters
    std::vector<std::string> param_names;
@@ -272,10 +277,23 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
    // NOTE(chenweihang): In order not to add new variable type, use vector
    // here. Originally, here can use scope directly.
    auto *out_scope_vec = ctx.Output<StepScopeVar>("OutScope");
    std::unique_ptr<framework::Scope> inner_scope{nullptr};
    if (out_scope_vec->size() == 0) {
      // For cuda graph under static mode usage.
      // For static mode, we cannot set value of a tensor before any run,
      // the OutScope variable passed to the op actually contains nothing.
      // Just create a tmp scope to run the program.
      PADDLE_ENFORCE_EQ(
          use_cuda_graph, true,
          platform::errors::InvalidArgument(
              "If not provide OutScope then must run under cuda graph mode."));
      inner_scope = std::make_unique<framework::Scope>();
    } else {
      PADDLE_ENFORCE_EQ(
          out_scope_vec->size(), 1,
          platform::errors::InvalidArgument(
              "The OutScope of RunProgramGradOp should only hold one scope."));
    }

    // Step 2. prepare executor and init persistable variables
@@ -284,9 +302,10 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
    // Learning. Tensor data in multi-step training should be saved into single
    // scope separately. Otherwise, the gradients can be miscalculated because
    // always using the Tensor data of the last step in forward.
-   framework::Scope *global_inner_scope = out_scope_vec->front();
+   framework::Scope *global_inner_scope =
+       out_scope_vec->size() == 0 ? inner_scope.get() : out_scope_vec->front();
    VLOG(2) << "The number of sub scopes before forward: "
-           << out_scope_vec->front()->kids().size();
+           << global_inner_scope->kids().size();
    framework::Scope &scope = global_inner_scope->NewScope();

    // share input_vars & parameters into scope
@@ -341,13 +360,19 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
                             &scope);

    // Debug info: scope info when run end
-   VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+   framework::Scope *target_scope{nullptr};
+   if (out_scope_vec->size() == 0) {
+     target_scope = inner_scope.get();
+   } else {
+     target_scope = out_scope_vec->front();
+   }
+   VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope);

    // Step 5. Drop all children scopes while testing.
    if (is_test) {
-     out_scope_vec->front()->DropKids();
+     target_scope->DropKids();
    }
    VLOG(2) << "The number of sub scopes after forward: "
-           << out_scope_vec->front()->kids().size();
+           << target_scope->kids().size();
#ifdef PADDLE_WITH_MKLDNN
    if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace());
#endif
paddle/fluid/platform/cuda_graph_with_memory_pool.cc

@@ -28,6 +28,16 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
  auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
  dev_ctx->cudnn_workspace_handle().ResetWorkspace();

  // After PR(#43206), cudnn related initializations will change to lazy mode.
  // It will only be initialized when op calls them. But cuda graph not support
  // capture such kind of init, need to init all these handle before cuda graph.
  dev_ctx->cublas_handle();
#if CUDA_VERSION >= 11060
  dev_ctx->cublaslt_handle();
#endif
  dev_ctx->cudnn_handle();
  dev_ctx->cusolver_dn_handle();

  auto stream = dev_ctx->stream();
  CUDAGraph::BeginCapture(place, stream, mode);
python/paddle/device/cuda/graphs.py

@@ -14,7 +14,10 @@
import os
import paddle
from paddle.fluid import core
from paddle.fluid.layers.utils import _hash_with_id
from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace
import warnings

if is_compiled_with_cuda() and not is_compiled_with_rocm():
    from paddle.fluid.core import CUDAGraph as CoreCUDAGraph

@@ -106,3 +109,335 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"):
    else:
        mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id
    return new_function


def copy_var_desc(dst, src):
    """
    copy var desc from src to dst
    :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance
    :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance
    :return: no return
    """
    dst.set_shape(src.shape)
    dst.set_dtype(src.dtype)
    dst.set_lod_level(src.lod_level)
    dst.set_type(src.type)
    dst.set_persistable(src.persistable)
    dst.set_is_parameter(src.is_parameter)
    dst.set_stop_gradient(src.stop_gradient)


def all_inputs_of_later_op(block, begin_idx):
    """
    find all inputs of ops after an idx, used to determine the logical output of a cuda graph section
    :param block: framework.Block, the original block
    :param begin_idx: int, from which idx (not include) to find the later ins
    :return: a list of inputs names for all ops behind begin_idx
    """
    ins = []
    for idx, op in enumerate(block.ops):
        if idx <= begin_idx:
            continue
        for in_name in op.input_arg_names:
            ins.append(in_name)
    return list(set(ins))


def construct_program_and_find_ins_outs(section, origin_program, section_idx):
    """
    1. Construct a new program for corresponding section
    2. Find all the logical inputs and outputs of a program section
    :param section: list, one cuda graph section, list of ops
    :param origin_program: framework.Program, origin program
    :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx
    :return: a new program for the cuda graph section
             the logical ins and outs of the cuda graph section
    """
    program = paddle.static.Program()
    block = program.global_block()
    origin_block = origin_program.global_block()
    ins = []
    outs = []
    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    later_ins = all_inputs_of_later_op(origin_block, section_idx[-1])

    for op in section:
        for in_name in op.input_arg_names:
            var = origin_block.var(in_name)
            new_var_desc = block.desc.var(var.name.encode("ascii"))
            copy_var_desc(new_var_desc, var)
            if outs.count(in_name) == 0 and ins.count(in_name) == 0:
                # This in var is generated from op outside this section
                # Only record once for same input
                ins.append(in_name)
            elif later_ins.count(in_name) == 0:
                # this is var is generated from op inside this section, and only will be used inside this section
                outs.remove(in_name)
        for out_name in op.output_arg_names:
            var = origin_block.var(out_name)
            new_var_desc = block.desc.var(var.name.encode("ascii"))
            copy_var_desc(new_var_desc, var)
            # for every output, we add it to the section's outs
            if outs.count(out_name) == 0:
                # Only record one out var even if it will be generated by multi ops.
                # For scenario like this:
                # A = op1(a)
                # A = op2(b)
                # B = op3(A)
                outs.append(out_name)
        new_op_desc = block.desc.append_op()
        new_op_desc.copy_from(op.desc)
        new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name))
    program._sync_with_cpp()

    return program, [ins, outs]


def get_cuda_graph_sections(program):
    """
    get all sections that should run under cuda graph and the corresponding idx
    :param program: framework.Program, the original program
    :return: A list of cuda graph sections and the corresponding ops' idx in the block.
             The program is under is test or not.
    """
    block = program.global_block()
    cuda_graph_sections = []  # record all ops in every cuda graph sections
    sections_idx = []  # idx of all ops in every cuda graph sections
    is_test = False  # will be set to True is any op's 'is_test' attr is True

    # ops and it's idx between cuda graph wrapped op, may belong to a section
    internal_section = []
    internal_idx = []

    current_section = []  # current recording cuda graph sections
    current_idx = []  # current recording cuda graph ops' idx
    current_cuda_graph_id = -1  # current recording cuda graph id
    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss)
    backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward)
    loss_grad_op_role = loss_op_role | backward_op_role

    for idx, op in enumerate(block.ops):
        if op.type == 'conditional_block' or op.type == 'while':
            assert op._cuda_graph_attr is None, \
                "Cuda graph not support conditional block op and while op."
        if op.has_attr('is_test') and op.attr('is_test'):
            is_test = True
        # find cuda graph sections
        if op._cuda_graph_attr is not None:
            assert isinstance(op._cuda_graph_attr,
                              str), "cuda_graph_attr should be a str"
            cuda_graph_attrs = op._cuda_graph_attr.split(';')
            assert len(cuda_graph_attrs) == 3, \
                "cuda graph attr should have three fields: " \
                "cuda graph mode, cuda graph memory pool id, cuda graph id"
            local_cuda_graph_id = int(cuda_graph_attrs[2])
            if local_cuda_graph_id == current_cuda_graph_id:
                if len(internal_section) > 0:
                    assert len(internal_section) == len(internal_idx), \
                        "len of internal section should be equal with len of internal idx"
                    for internal_op in internal_section:
                        loss_related = (int(internal_op.attr(op_role_attr_name))
                                        == loss_op_role) or int(
                                            (internal_op.attr(op_role_attr_name)
                                             ) == loss_grad_op_role)
                        sub_block_related = (op.type == 'conditional_block'
                                             or op.type == 'while')
                        if loss_related or sub_block_related:
                            # if loss_related is True
                            # The internal section contains loss related ops,
                            # although these ops are between two cuda graph sections with same graph id,
                            # they belong to none of these two sections.
                            # The loss related op should be wrapped by user explicitly.

                            # if sub_block_related is True
                            # The internal section contains while op or conditional block op.
                            # These two ops are not supported by cuda graph. Won't extend the section.
                            internal_section = []
                            internal_idx = []
                            # Beside clear the internal section, a new cuda graph section should be recorded
                            assert len(current_section) == len(current_idx), \
                                "num of section's op is not equal with the idx"
                            if len(current_section) > 0:
                                # store previous section
                                cuda_graph_sections.append(current_section)
                                sections_idx.append(current_idx)
                            current_section = []
                            current_idx = []
                            break
                    # some ops inserted by some optimizer, should be added to current section
                    for i in range(len(internal_section)):
                        current_section.append(internal_section[i])
                        current_idx.append(internal_idx[i])
                internal_section = []
                current_section.append(op)
                current_idx.append(idx)
            else:
                # current graph id is different with previous, start a new section of cuda graph
                # internal ops and idx belong to no section, just clear it
                internal_section = []
                internal_idx = []
                current_cuda_graph_id = local_cuda_graph_id  # start record a new section
                assert len(current_section) == len(current_idx), \
                    "num of section's op is not equal with num of idx"
                if len(current_section) > 0:
                    # store previous section
                    cuda_graph_sections.append(current_section)
                    sections_idx.append(current_idx)
                current_section = [op]
                current_idx = [idx]
        else:
            # recode ops which cuda_graph_attr is None, may belong to a section
            internal_section.append(op)
            internal_idx.append(idx)

    # handle the last section
    assert len(current_section) == len(current_idx), \
        "num of section's op is not equal with num of idx"
    if len(current_section) > 0:
        # store previous section
        cuda_graph_sections.append(current_section)
        sections_idx.append(current_idx)

    return cuda_graph_sections, sections_idx, is_test


def replace_cuda_graph_section(ins_and_outs, section_program, section_idx,
                               origin_program, cuda_graph_section, order,
                               is_test):
    """
    Use section_program and ins_and_outs to initialize a run_program_op,
    and replace the section_idx marks ops in the origin program.
    :param ins_and_outs: list, the logical ins and outs of the section program
    :param section_program: framework.Program, the partial program need to run under cuda graph
    :param section_idx: list, the idx need to be removed from origin program
    :param origin_program: framework.Program, the origin program
    :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test
    :param order: int, the order of current section, used to create unique cuda graph var
    :param is_test: bool, the program is running under is_test or not
    :return: no return
    """
    ins = ins_and_outs[0]
    outs = ins_and_outs[1]
    insert_idx = section_idx[0]
    origin_block = origin_program.global_block()

    for idx in reversed(section_idx):
        # remove all cuda graph marked ops from origin block
        origin_block._remove_op(idx, sync=False)

    mode = None
    memory_pool_id = None
    for op in cuda_graph_section:
        # find the cuda graph mode and memory pool id, determine is test or not
        if op._cuda_graph_attr is not None:
            attrs = op._cuda_graph_attr.split(';')
            mode = attrs[0]
            memory_pool_id = int(attrs[1])
            break

    assert mode is not None and memory_pool_id is not None, \
        "mode and memory pool id should be specified in cuda graph attr"

    cuda_graph_var = origin_block.create_var(
        name="cuda_graph_" + str(order),
        type=core.VarDesc.VarType.RAW,
        persistable=True,
        stop_gradient=True,
    )

    # not used for the run_program_op, just needed by the op, but won't be used
    out_scope_var = origin_block.create_var(
        name="program_out_scope_" + str(order),
        type=core.VarDesc.VarType.STEP_SCOPES,
        persistable=True,
        stop_gradient=True,
    )

    program_id = _hash_with_id(section_program, ins_and_outs)

    # insert the run_program_op into the block
    origin_block._insert_op(
        insert_idx,
        type='run_program',
        inputs={'X': ins},
        outputs={
            'Out': outs,
            'OutScope': out_scope_var,
            'CUDAGraph': cuda_graph_var
        },
        attrs={
            'global_block': section_program.global_block(),
            'start_op_index': 0,
            'end_op_index': len(section_program.global_block().ops),
            'is_test': is_test,
            'program_id': program_id,
            'cuda_graph_capture_mode': mode,
            'cuda_graph_pool_id': memory_pool_id,
        })


def cuda_graph_transform(program):
    """
    replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph
    :param program: framework.Program, the program to be transformed
    :return: the cuda graph section program, user should hold these programs!
    """

    if len(program.blocks) > 1:
        # some sub blocks may be inserted by optimizer but will not use during training, just warn here
        warnings.warn(
            "Sub block(s) has been detected in the program. "
            "Cuda graph not support op with sub block, and it will only handle the global block."
        )

    # step 1: get all cuda graph sections.
    # A cuda graph section contains all ops marked with same cuda graph id and
    # some ops inserted by some optimizers (amp, sharding for example) between ops with same id.
    cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections(
        program)
    assert len(cuda_graph_sections) == len(sections_idx), \
        "num of cuda graph sections is not equal with num of idx sections"

    # step 2: construct new program for each section and find inputs and outputs of each section.
    # The inputs are variables generated outside the section but will be used by this section.
    # The outputs are variables generated by this section and will be used after the end of the section.
    ins_and_outs = []
    section_programs = []
    for i in range(len(cuda_graph_sections)):
        # creating new program for current section
        section_program, ins_outs = construct_program_and_find_ins_outs(
            cuda_graph_sections[i], program, sections_idx[i])
        ins_and_outs.append(ins_outs)
        section_programs.append(section_program)
    assert len(section_programs) == len(cuda_graph_sections), \
        "the num of cuda graph sections should be equal with the num of new program"

    # step 3: replace the ops in original program with run_program_op.
    # Will remove all ops in the section from origin program, and use run_program_op to replace them.
    for i in reversed(range(len(cuda_graph_sections))):
        # carry out the replacement in reversed order, to keep the previous idx intact
        replace_cuda_graph_section(ins_and_outs[i],
                                   section_programs[i],
                                   sections_idx[i],
                                   program,
                                   cuda_graph_sections[i],
                                   order=i,
                                   is_test=is_test)

    # NOTE: user should hold these program, for now just return these program back to caller
    return section_programs
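As a quick sanity check on what the transform produces (continuing the sketch near the top of this page; the exact counts depend on how many sections were marked), the rewritten main program can be inspected directly, which is also what the new unit test below asserts:

# immediately after the cuda_graph_transform(main_prog) call above:
op_types = [op.type for op in main_prog.global_block().ops]
print(op_types.count('run_program'))  # one run_program op per marked section
print(len(section_programs))          # the matching number of section programs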
python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py  (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import unittest
import numpy as np
from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported, cuda_graph_transform

paddle.enable_static()


class SimpleModel(nn.Layer):

    def __init__(self, in_size, out_size):
        super(SimpleModel, self).__init__()
        self.linear = nn.Linear(in_size, out_size)
        self.dropout_1 = paddle.nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.dropout_2 = paddle.nn.Dropout(0.5)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.linear(x)
        x = self.dropout_1(x)
        x = self.relu(x)
        x = self.dropout_2(x)
        x = self.gelu(x)
        return x


class TestCudaGraphAttrAll(unittest.TestCase):

    def setUp(self):
        paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0})

    def get_model(self, use_cuda_graph=False):
        x = paddle.static.data(shape=[3, 10], dtype='float32', name='x')

        model_start = SimpleModel(10, 20)
        if use_cuda_graph:
            model_start = wrap_cuda_graph(model_start)

        model_inter = SimpleModel(20, 20)

        model_end = SimpleModel(20, 10)
        if use_cuda_graph:
            model_end = wrap_cuda_graph(model_end, memory_pool='new')

        start_out = model_start(x)
        inter_out = model_inter(start_out)
        end_out = model_end(inter_out)
        loss = paddle.mean(end_out)

        opt = paddle.optimizer.SGD()
        opt.minimize(loss)

        return loss

    def run_with_cuda_graph(self, x_data):
        # run with cuda graph
        paddle.seed(1024)

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()

        with paddle.static.program_guard(main_prog, start_prog):
            loss = self.get_model(use_cuda_graph=True)

        section_programs = cuda_graph_transform(main_prog)
        assert len(section_programs) == 4

        block = main_prog.global_block()
        run_program_op_num = 0
        for op in block.ops:
            if op.type == 'run_program':
                run_program_op_num += 1
        assert run_program_op_num == 4

        exe = paddle.static.Executor(paddle.CUDAPlace(0))
        exe.run(start_prog)

        for i in range(10):
            rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss])

        return rst

    def normal_run(self, x_data):
        # run without cuda graph
        paddle.seed(1024)

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()

        with paddle.static.program_guard(main_prog, start_prog):
            loss = self.get_model()

        exe = paddle.static.Executor(paddle.CUDAPlace(0))
        exe.run(start_prog)

        for i in range(10):
            rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss])

        return rst

    def test_static_mode_cuda_graph(self):
        if not is_cuda_graph_supported():
            return
        x_data = np.random.random((3, 10)).astype('float32')
        cuda_graph_rst = self.run_with_cuda_graph(x_data)
        normal_run_rst = self.normal_run(x_data)
        assert np.array_equal(cuda_graph_rst, normal_run_rst)


if __name__ == "__main__":
    unittest.main()