PaddlePaddle / Paddle
Commit e36f80c6 (unverified)
Authored on Apr 17, 2023 by Chitsing KUI; committed via GitHub on Apr 17, 2023
[Fused] controlled randomness for fused dropout add (#52903)
* add random control for fused dropout add
* add __init__
Parent: d19d2486

Showing 10 changed files with 220 additions and 19 deletions (+220 −19)
paddle/phi/api/yaml/fused_backward.yaml (+1 −1)
paddle/phi/api/yaml/fused_ops.yaml (+3 −1)
paddle/phi/infermeta/binary.cc (+0 −5)
paddle/phi/infermeta/binary.h (+0 −5)
paddle/phi/kernels/funcs/dropout_impl.cu.h (+2 −0)
paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu (+11 −2)
python/paddle/distributed/auto_parallel/operators/__init__.py (+1 −0)
python/paddle/distributed/auto_parallel/operators/dist_fused_dropout_add.py (+191 −0)
python/paddle/distributed/passes/auto_parallel_recompute.py (+9 −4)
python/paddle/incubate/nn/functional/fused_dropout_add.py (+2 −1)
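For orientation before the per-file diffs: the user-facing entry point touched here is paddle.incubate.nn.functional.fused_dropout_add, which computes dropout(x) + y in one fused GPU kernel. A minimal usage sketch (assuming a CUDA build of Paddle that includes this commit; the fused kernel is GPU-only):

    import paddle
    from paddle.incubate.nn.functional import fused_dropout_add

    paddle.seed(2023)  # global seed; the new seed_tensor input stays None here
    x = paddle.randn([4, 8])
    y = paddle.randn([4, 8])
    out = fused_dropout_add(x, y, p=0.5, training=True)  # out = dropout(x) + y

The new optional seed_tensor input is not part of this Python signature; it is wired in behind the scenes by the auto-parallel machinery in the files below.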
paddle/phi/api/yaml/fused_backward.yaml

@@ -5,7 +5,7 @@
 # otherwise the operator only could be used in static mode.

 - backward_op : fused_dropout_add_grad
-  forward : fused_dropout_add (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
+  forward : fused_dropout_add (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
   args : (Tensor seed_offset, Tensor out_grad, Scalar p, bool is_test, str mode, bool fix_seed)
   output : Tensor(x_grad), Tensor(y_grad)
   infer_meta :
paddle/phi/api/yaml/fused_ops.yaml

@@ -34,10 +34,12 @@
   optional : bias, x_max

 - op : fused_dropout_add
-  args : (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed)
+  args : (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed = 0, bool fix_seed = false)
+  optional : seed_tensor
   output : Tensor(out), Tensor(seed_offset)
   infer_meta :
     func : FusedDropoutAddInferMeta
     param : [x, y]
   kernel :
     func : fused_dropout_add
     data_type : x
paddle/phi/infermeta/binary.cc

@@ -1282,11 +1282,6 @@ void FillDiagonalTensorInferMeta(const MetaTensor& x,
 void FusedDropoutAddInferMeta(const MetaTensor& x,
                               const MetaTensor& y,
-                              const Scalar& p,
-                              bool is_test,
-                              const std::string& mode,
-                              int seed,
-                              bool fix_seed,
                               MetaTensor* out,
                               MetaTensor* seed_offset) {
   out->share_meta(x);
paddle/phi/infermeta/binary.h

@@ -224,11 +224,6 @@ void FillDiagonalTensorInferMeta(const MetaTensor& x,
 void FusedDropoutAddInferMeta(const MetaTensor& x,
                               const MetaTensor& y,
-                              const Scalar& p,
-                              bool is_test,
-                              const std::string& mode,
-                              int seed,
-                              bool fix_seed,
                               MetaTensor* out,
                               MetaTensor* seed_offset);
paddle/phi/kernels/funcs/dropout_impl.cu.h

@@ -408,6 +408,8 @@ void DropoutFwGPUKernelDriver(
           main_offset);
 #undef PD_DROPOUT_KERNEL_NAME
     }
+    VLOG(4) << "Dropout seed: " << seed << ", offset: " << offset
+            << ", seed_data:" << seed_data;
   } else {
     if (upscale_in_train) {
       // y = x
paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu

@@ -139,6 +139,7 @@ template <typename T, typename Context>
 void FusedDropoutAddKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& y,
+                           const paddle::optional<DenseTensor>& seed_tensor,
                            const Scalar& p,
                            bool is_test,
                            const std::string& mode,

@@ -168,11 +169,19 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     size_t block_size = random_prop[1];
     size_t offset = random_prop[2];
     size_t main_offset = random_prop[3];
-    funcs::GetSeedDataAndIncrement(
-        dev_ctx, nullptr, fix_seed, seed, offset, &seed_data, &increment);
+    funcs::GetSeedDataAndIncrement(dev_ctx,
+                                   seed_tensor.get_ptr(),
+                                   fix_seed,
+                                   seed,
+                                   offset,
+                                   &seed_data,
+                                   &increment);
+    seed_offset_data[0] = static_cast<int64_t>(seed_data);
+    seed_offset_data[1] = static_cast<int64_t>(increment);
+    VLOG(4) << "FusedDropoutAdd seed: " << seed << ", offset: " << offset
+            << ", seed_data:" << seed_data;
     auto dst_functor =
         NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);
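The kernel change has two parts: the seed is now taken from the optional seed_tensor when present, and the (seed, increment) pair actually used is written into seed_offset so the backward kernel can regenerate the identical mask. A toy sketch of that replay contract, with NumPy standing in for the counter-based Philox stream the CUDA kernel uses (hypothetical helper, illustration only):

    import numpy as np

    def toy_dropout_mask(seed, offset, n, p):
        # same (seed, offset) pair -> same mask, which is the property
        # the recorded seed_offset output relies on
        rng = np.random.default_rng([seed, offset])
        return (rng.random(n) >= p).astype(np.float32)

    fwd_mask = toy_dropout_mask(42, 0, 8, 0.5)
    bwd_mask = toy_dropout_mask(42, 0, 8, 0.5)  # replayed from seed_offset
    assert (fwd_mask == bwd_mask).all()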
python/paddle/distributed/auto_parallel/operators/__init__.py

@@ -32,6 +32,7 @@ from . import dist_pnorm
 from . import dist_slice
 from . import dist_fused_feedforward
 from . import dist_fused_attention
+from . import dist_fused_dropout_add
 from . import dist_reduce_sum_p
 from . import dist_shape
 from . import dist_assign
python/paddle/distributed/auto_parallel/operators/dist_fused_dropout_add.py (new file, 0 → 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License

import logging

import paddle
from paddle.framework import core
from paddle.utils import unique_name

from ...utils.log_utils import get_logger

_logger = get_logger(logging.INFO)
from ..random import determinate_rng, is_enable_auto_rand_ctrl
from ..utils import (
    naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
    set_var_dist_attr,
)
from .common import (
    DistributedOperatorImplContainer,
    register_distributed_operator_impl,
    register_distributed_operator_impl_container,
)
from .dist_eltwise import DistributedDefaultImpl0, DistributedElementwiseImpl0


class DistributedDropout(DistributedOperatorImplContainer):
    def __init__(self, op_type):
        super().__init__(op_type)


register_distributed_operator_impl_container(
    DistributedDropout("fused_dropout_add")
)


# Dist Dropout with Random Control
# Dropout re-uses the compatibility and cost functions of elementwise
class DistributedDropoutImpl0(DistributedElementwiseImpl0):
    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = True

    def is_input_compatible(self, dist_op):
        return True

    def is_output_compatible(self, dist_op):
        return True

    def is_auto_compatible(self, dist_op):
        return True

    @staticmethod
    def forward(ctx, *args, **kwargs):
        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        startup_block = dist_op_context.startup_block
        src_op = dist_op_context.cur_src_op
        rank_id = dist_op_context.rank_id
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)

        if is_enable_auto_rand_ctrl() and not op_dist_attr.is_recompute:
            assert (
                op_dist_attr is not None
            ), f"forward op [{str(src_op)}] doesn't have a dist attribute!"

            assert 'seed_tensor' in kwargs, "input [{}] is not given".format(
                'seed_tensor'
            )

            if (
                src_op.has_attr("fix_seed")
                and src_op.attr("fix_seed")
                and src_op.has_attr("seed")
                and src_op.attr("seed")
            ):
                _logger.info(
                    "Auto Parallel Random Control Skipped Since manual seed is set by user: {}".format(
                        src_op
                    )
                )
            elif rank_id not in op_dist_attr.process_mesh.process_ids:
                pass
            elif (
                len(kwargs['seed_tensor']) > 0
                or len(src_op.input("seed_tensor")) > 0
            ):
                seed_var_name = kwargs['seed_tensor'][0]
                if seed_var_name.startswith('rc_seed'):
                    pre_op = main_block.ops[-1]
                    assert (
                        pre_op.type == "seed"
                        and len(pre_op.attr("rng_name")) == 0
                    ), f"found exception op {str(pre_op)}"

                    # determinate rng
                    X_var = main_block._var_recursive(kwargs['x'][0])
                    X_dims_mapping = op_dist_attr.get_input_dims_mapping(
                        X_var.name
                    )
                    process_mesh = op_dist_attr.process_mesh
                    rng_name = determinate_rng(
                        rank_id, X_dims_mapping, process_mesh
                    )
                    # make recompute seed under control
                    pre_op._set_attr("rng_name", rng_name)
                    pre_op._set_attr("deterministic", True)
                    pre_op._set_attr("force_cpu", True)
                else:
                    _logger.info(
                        "Auto Parallel Random Control Skipped Since manual seed is set by user: {}".format(
                            src_op
                        )
                    )
            else:
                # determinate rng
                X_var = main_block._var_recursive(kwargs['x'][0])
                X_dims_mapping = op_dist_attr.get_input_dims_mapping(X_var.name)
                process_mesh = op_dist_attr.process_mesh
                rng_name = determinate_rng(
                    rank_id, X_dims_mapping, process_mesh
                )
                assert rng_name is not None and rng_name != ""

                # insert seed op
                seed_var = main_block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join(["tensor_parallel_seed", 'tmp'])
                    ),
                    dtype=paddle.int32,
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    persistable=False,
                    stop_gradient=False,
                )

                # set new seed_var's dist_attr
                seed_var_dims_mapping = [-1]
                seed_var_dist_attr = set_var_dist_attr(
                    ctx, seed_var, seed_var_dims_mapping, process_mesh
                )

                # adopt for recompute
                # force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang
                seed_op = main_block.append_op(
                    type='seed',
                    outputs={'Out': seed_var},
                    attrs={
                        'deterministic': True,
                        'rng_name': rng_name,
                        'force_cpu': True,
                    },
                )
                seed_op._set_attr('op_namescope', 'auto_tensor_parallel_seed')
                # set new seed op's dist_attr
                naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
                    seed_op, process_mesh, seed_var_dims_mapping, ctx
                )

                # modify dropout op
                src_op.desc.set_input("seed_tensor", [seed_var.name])
                src_op._remove_attr("fix_seed")
                src_op._remove_attr("seed")
                op_dist_attr.set_input_dist_attr(
                    seed_var.name, seed_var_dist_attr
                )
                kwargs['seed_tensor'] = [seed_var.name]

        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)

    @staticmethod
    def backward(ctx, *args, **kwargs):
        # dropout backward is deterministic by mask and does not need random state control
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


register_distributed_operator_impl(
    "fused_dropout_add", DistributedDropoutImpl0("random_control")
)
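The pivotal call in this new file is determinate_rng(rank_id, X_dims_mapping, process_mesh): ranks that hold different shards of x must draw different dropout masks, while replays of the same op (e.g. under recompute) must reproduce the same mask. A toy sketch of that stream-naming idea (hypothetical helper, not Paddle's actual determinate_rng; a faithful version would also map data-parallel replicas of the same shard to one name):

    def toy_determinate_rng(rank_id, dims_mapping, mesh_shape):
        # one named stream per (mesh, sharding, rank) triple: deterministic
        # to replay, but distinct across shards of the same tensor
        mesh = "x".join(map(str, mesh_shape))
        dims = ",".join(map(str, dims_mapping))
        return f"mesh_{mesh}_dims_{dims}_rank_{rank_id}"

    print(toy_determinate_rng(0, [-1, 0], (2, 2)))  # mesh_2x2_dims_-1,0_rank_0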
python/paddle/distributed/passes/auto_parallel_recompute.py

@@ -115,7 +115,7 @@ class RecomputeState(ProgramStats):
         a seed op before it to guarantee that two dropout ops have the same outputs.
         """
         op_types = [op.type for op in self.ops]
-        if "dropout" not in op_types:
+        if "dropout" not in op_types and "fused_dropout_add" not in op_types:
             return

         op_idx = 0

@@ -127,10 +127,15 @@ class RecomputeState(ProgramStats):
                 self._reserved_vars.extend(cur_op.output_arg_names)
                 op_idx += 1
                 continue
-            if cur_op.type != "dropout":
+            if cur_op.type not in ["dropout", "fused_dropout_add"]:
                 op_idx += 1
                 continue
-            if cur_op.input("Seed") is not None and len(cur_op.input("Seed")):
+            seed_tensor_name = (
+                "seed_tensor" if cur_op.type == "fused_dropout_add" else "Seed"
+            )
+            if cur_op.input(seed_tensor_name) is not None and len(
+                cur_op.input(seed_tensor_name)
+            ):
                 op_idx += 1
                 continue

@@ -179,7 +184,7 @@ class RecomputeState(ProgramStats):
             # modify dropout op's desc
             self.ops.insert(op_idx, seed_op)
-            cur_op.desc.set_input("Seed", [var_unique_name])
+            cur_op.desc.set_input(seed_tensor_name, [var_unique_name])
             cur_op._remove_attr("fix_seed")
             cur_op._remove_attr("seed")
             cur_op_dist_attr.set_input_dist_attr(
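The recompute pass needs the shared seed input because a dropout op's body runs twice: once in the original forward and again when activations are recomputed for the backward pass, and both runs must produce the same mask. A pure-Python toy of the invariant the inserted seed op guarantees (illustration only):

    import random

    def toy_dropout(xs, p, seed):
        rng = random.Random(seed)  # the inserted seed op plays this role
        return [0.0 if rng.random() < p else v / (1 - p) for v in xs]

    xs = [1.0, 2.0, 3.0, 4.0]
    forward = toy_dropout(xs, 0.5, seed=7)
    recomputed = toy_dropout(xs, 0.5, seed=7)  # backward replays the same seed
    assert forward == recomputed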
python/paddle/incubate/nn/functional/fused_dropout_add.py

@@ -79,6 +79,7 @@ def fused_dropout_add(
         out, seed_offset = _C_ops.fused_dropout_add(
             x,
             y,
+            None,
             p,
             not training,
             mode,

@@ -109,7 +110,7 @@ def fused_dropout_add(
     helper.append_op(
         type='fused_dropout_add',
-        inputs={'x': x, 'y': y},
+        inputs={'x': x, 'y': y, 'seed_tensor': None},
         outputs={'out': [out], 'seed_offset': [seed_offset]},
         attrs=attrs,
     )
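In the static-graph branch, helper.append_op now declares the seed_tensor input (as None) so the op desc matches the new signature, giving the auto-parallel pass a slot it can later rewire to a real seed op's output. A minimal static-mode sketch of building a program that contains the op (again assuming a CUDA build with this commit):

    import paddle
    from paddle.incubate.nn.functional import fused_dropout_add

    paddle.enable_static()
    main = paddle.static.Program()
    with paddle.static.program_guard(main):
        x = paddle.static.data("x", [2, 4], "float32")
        y = paddle.static.data("y", [2, 4], "float32")
        out = fused_dropout_add(x, y, p=0.5, training=True)
    # the op desc now carries an empty seed_tensor slot that
    # dist_fused_dropout_add.py can point at a seed op's output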