Unverified commit e36f80c6, authored by Chitsing KUI, committed by GitHub

[Fused] controlled randomness for fused dropout add (#52903)

* add random control for fused dropout add

* add __init__
Parent d19d2486
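Note: `fused_dropout_add(x, y)` fuses `dropout(x) + y` into one kernel; this commit threads an optional `seed_tensor` input through the op so that auto-parallel random control (and the recompute pass) can inject a deterministic seed instead of relying only on the `seed`/`fix_seed` attributes. As a reference for what the op computes, here is a minimal sketch of the unfused semantics in plain Paddle ops (illustration only, not the fused kernel):

```python
import paddle


def dropout_add_reference(x, y, p=0.5, training=True, mode="upscale_in_train"):
    # Unfused reference: dropout applied to x, then an elementwise add with y.
    # The fused op produces the same result in a single GPU kernel.
    return paddle.nn.functional.dropout(x, p=p, training=training, mode=mode) + y


x = paddle.randn([4, 8])
y = paddle.randn([4, 8])
out = dropout_add_reference(x, y, p=0.1)
```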
......@@ -5,7 +5,7 @@
# otherwise the operator could only be used in static mode.
- backward_op : fused_dropout_add_grad
  forward : fused_dropout_add (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
  forward : fused_dropout_add (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(seed_offset)
  args : (Tensor seed_offset, Tensor out_grad, Scalar p, bool is_test, str mode, bool fix_seed)
  output : Tensor(x_grad), Tensor(y_grad)
  infer_meta :
......
......@@ -34,10 +34,12 @@
  optional : bias, x_max
- op : fused_dropout_add
  args : (Tensor x, Tensor y, Scalar p, bool is_test, str mode, int seed, bool fix_seed)
  args : (Tensor x, Tensor y, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed = 0, bool fix_seed = false)
  optional : seed_tensor
  output : Tensor(out), Tensor(seed_offset)
  infer_meta :
    func : FusedDropoutAddInferMeta
    param : [x, y]
  kernel :
    func : fused_dropout_add
    data_type : x
......
......@@ -1282,11 +1282,6 @@ void FillDiagonalTensorInferMeta(const MetaTensor& x,
void FusedDropoutAddInferMeta(const MetaTensor& x,
                              const MetaTensor& y,
                              const Scalar& p,
                              bool is_test,
                              const std::string& mode,
                              int seed,
                              bool fix_seed,
                              MetaTensor* out,
                              MetaTensor* seed_offset) {
  out->share_meta(x);
......
......@@ -224,11 +224,6 @@ void FillDiagonalTensorInferMeta(const MetaTensor& x,
void FusedDropoutAddInferMeta(const MetaTensor& x,
                              const MetaTensor& y,
                              const Scalar& p,
                              bool is_test,
                              const std::string& mode,
                              int seed,
                              bool fix_seed,
                              MetaTensor* out,
                              MetaTensor* seed_offset);
......
......@@ -408,6 +408,8 @@ void DropoutFwGPUKernelDriver(
          main_offset);
#undef PD_DROPOUT_KERNEL_NAME
    }
    VLOG(4) << "Dropout seed: " << seed << ", offset: " << offset
            << ", seed_data:" << seed_data;
  } else {
    if (upscale_in_train) {
      // y = x
......
......@@ -139,6 +139,7 @@ template <typename T, typename Context>
void FusedDropoutAddKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           const DenseTensor& y,
                           const paddle::optional<DenseTensor>& seed_tensor,
                           const Scalar& p,
                           bool is_test,
                           const std::string& mode,
......@@ -168,11 +169,19 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
    size_t block_size = random_prop[1];
    size_t offset = random_prop[2];
    size_t main_offset = random_prop[3];
    funcs::GetSeedDataAndIncrement(
        dev_ctx, nullptr, fix_seed, seed, offset, &seed_data, &increment);
    funcs::GetSeedDataAndIncrement(dev_ctx,
                                   seed_tensor.get_ptr(),
                                   fix_seed,
                                   seed,
                                   offset,
                                   &seed_data,
                                   &increment);
    seed_offset_data[0] = static_cast<int64_t>(seed_data);
    seed_offset_data[1] = static_cast<int64_t>(increment);
    VLOG(4) << "FusedDropoutAdd seed: " << seed << ", offset: " << offset
            << ", seed_data:" << seed_data;
    auto dst_functor =
        NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);
......
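The kernel change above swaps the hard-coded `nullptr` for `seed_tensor.get_ptr()`, so `funcs::GetSeedDataAndIncrement` can take its seed from the optional tensor input. A simplified sketch of the precedence this implies (the function and argument names below are illustrative, not Paddle's actual helper): an explicit seed tensor wins, then a fixed attribute seed, otherwise the per-device generator.

```python
def resolve_seed(seed_tensor, fix_seed, seed_attr, generator_seed):
    # Hypothetical sketch of the seed precedence used by the fused kernel.
    if seed_tensor is not None:   # seed injected by random control / recompute
        return int(seed_tensor)
    if fix_seed:                  # user pinned the seed via op attributes
        return seed_attr
    return generator_seed         # fall back to the per-device random generator


print(resolve_seed(None, False, 0, generator_seed=12345))  # -> 12345
print(resolve_seed(7, True, 42, generator_seed=12345))     # -> 7
```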
......@@ -32,6 +32,7 @@ from . import dist_pnorm
from . import dist_slice
from . import dist_fused_feedforward
from . import dist_fused_attention
from . import dist_fused_dropout_add
from . import dist_reduce_sum_p
from . import dist_shape
from . import dist_assign
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import logging

import paddle
from paddle.framework import core
from paddle.utils import unique_name

from ...utils.log_utils import get_logger

_logger = get_logger(logging.INFO)
from ..random import determinate_rng, is_enable_auto_rand_ctrl
from ..utils import (
    naive_set_dist_op_attr_for_program_by_mesh_and_mapping,
    set_var_dist_attr,
)
from .common import (
    DistributedOperatorImplContainer,
    register_distributed_operator_impl,
    register_distributed_operator_impl_container,
)
from .dist_eltwise import DistributedDefaultImpl0, DistributedElementwiseImpl0


class DistributedDropout(DistributedOperatorImplContainer):
    def __init__(self, op_type):
        super().__init__(op_type)


register_distributed_operator_impl_container(
    DistributedDropout("fused_dropout_add")
)


# Dist Dropout with Random Control
# Dropout reuses the compatibility and cost functions of elementwise
class DistributedDropoutImpl0(DistributedElementwiseImpl0):
    def __init__(self, name):
        super().__init__(name)
        self._forward_implemented = True
        self._backward_implemented = True

    def is_input_compatible(self, dist_op):
        return True

    def is_output_compatible(self, dist_op):
        return True

    def is_auto_compatible(self, dist_op):
        return True

    @staticmethod
    def forward(ctx, *args, **kwargs):
        dist_op_context = ctx.dist_op_context
        main_block = dist_op_context.work_block
        startup_block = dist_op_context.startup_block
        src_op = dist_op_context.cur_src_op
        rank_id = dist_op_context.rank_id
        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)

        if is_enable_auto_rand_ctrl() and not op_dist_attr.is_recompute:
            assert (
                op_dist_attr is not None
            ), f"forward op [{str(src_op)}] does not have a dist attribute!"

            assert 'seed_tensor' in kwargs, "input [{}] is not given".format(
                'seed_tensor'
            )

            if (
                src_op.has_attr("fix_seed")
                and src_op.attr("fix_seed")
                and src_op.has_attr("seed")
                and src_op.attr("seed")
            ):
                _logger.info(
                    "Auto Parallel Random Control Skipped since a manual seed is set by the user: {}".format(
                        src_op
                    )
                )
            elif rank_id not in op_dist_attr.process_mesh.process_ids:
                pass
            elif (
                len(kwargs['seed_tensor']) > 0
                or len(src_op.input("seed_tensor")) > 0
            ):
                seed_var_name = kwargs['seed_tensor'][0]
                if seed_var_name.startswith('rc_seed'):
                    pre_op = main_block.ops[-1]
                    assert (
                        pre_op.type == "seed"
                        and len(pre_op.attr("rng_name")) == 0
                    ), f"found exception op {str(pre_op)}"

                    # determinate rng
                    X_var = main_block._var_recursive(kwargs['x'][0])
                    X_dims_mapping = op_dist_attr.get_input_dims_mapping(
                        X_var.name
                    )
                    process_mesh = op_dist_attr.process_mesh
                    rng_name = determinate_rng(
                        rank_id, X_dims_mapping, process_mesh
                    )
                    # bring the recompute seed under random control
                    pre_op._set_attr("rng_name", rng_name)
                    pre_op._set_attr("deterministic", True)
                    pre_op._set_attr("force_cpu", True)
                else:
                    _logger.info(
                        "Auto Parallel Random Control Skipped since a manual seed is set by the user: {}".format(
                            src_op
                        )
                    )
            else:
                # determinate rng
                X_var = main_block._var_recursive(kwargs['x'][0])
                X_dims_mapping = op_dist_attr.get_input_dims_mapping(X_var.name)
                process_mesh = op_dist_attr.process_mesh
                rng_name = determinate_rng(
                    rank_id, X_dims_mapping, process_mesh
                )
                assert rng_name is not None and rng_name != ""

                # insert seed op
                seed_var = main_block.create_var(
                    name=unique_name.generate_with_ignorable_key(
                        ".".join(["tensor_parallel_seed", 'tmp'])
                    ),
                    dtype=paddle.int32,
                    type=core.VarDesc.VarType.LOD_TENSOR,
                    persistable=False,
                    stop_gradient=False,
                )

                # set the new seed_var's dist_attr
                seed_var_dims_mapping = [-1]
                seed_var_dist_attr = set_var_dist_attr(
                    ctx, seed_var, seed_var_dims_mapping, process_mesh
                )

                # adapted for recompute
                # force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang
                seed_op = main_block.append_op(
                    type='seed',
                    outputs={'Out': seed_var},
                    attrs={
                        'deterministic': True,
                        'rng_name': rng_name,
                        'force_cpu': True,
                    },
                )
                seed_op._set_attr('op_namescope', 'auto_tensor_parallel_seed')
                # set the new seed op's dist_attr
                naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
                    seed_op, process_mesh, seed_var_dims_mapping, ctx
                )

                # modify the dropout op
                src_op.desc.set_input("seed_tensor", [seed_var.name])
                src_op._remove_attr("fix_seed")
                src_op._remove_attr("seed")
                op_dist_attr.set_input_dist_attr(
                    seed_var.name, seed_var_dist_attr
                )
                kwargs['seed_tensor'] = [seed_var.name]

        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)

    @staticmethod
    def backward(ctx, *args, **kwargs):
        # dropout backward is deterministic via the mask and does not need random state control
        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)


register_distributed_operator_impl(
    "fused_dropout_add", DistributedDropoutImpl0("random_control")
)
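For readability, the branch structure of `DistributedDropoutImpl0.forward` above can be summarized as a plain decision function; this is only an illustrative sketch and does not mutate the program the way the real implementation does.

```python
def choose_seed_strategy(fix_seed, seed, rank_in_mesh, seed_tensor_names):
    # Illustrative summary of DistributedDropoutImpl0.forward's branches.
    if fix_seed and seed:
        return "skip: manual seed set by the user"
    if not rank_in_mesh:
        return "skip: rank not in the op's process mesh"
    if seed_tensor_names:
        if seed_tensor_names[0].startswith("rc_seed"):
            return "reuse the recompute-inserted seed op and set its rng_name"
        return "skip: a seed tensor is already supplied"
    return "insert a new deterministic seed op and rewire seed_tensor"


print(choose_seed_strategy(False, 0, True, []))             # insert a new seed op
print(choose_seed_strategy(False, 0, True, ["rc_seed_0"]))  # reuse the recompute seed
```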
......@@ -115,7 +115,7 @@ class RecomputeState(ProgramStats):
        a seed op before it to guarantee that the two dropout ops have the same outputs.
        """
        op_types = [op.type for op in self.ops]
        if "dropout" not in op_types:
        if "dropout" not in op_types and "fused_dropout_add" not in op_types:
            return
        op_idx = 0
......@@ -127,10 +127,15 @@ class RecomputeState(ProgramStats):
                self._reserved_vars.extend(cur_op.output_arg_names)
                op_idx += 1
                continue
            if cur_op.type != "dropout":
            if cur_op.type not in ["dropout", "fused_dropout_add"]:
                op_idx += 1
                continue
            if cur_op.input("Seed") is not None and len(cur_op.input("Seed")):
            seed_tensor_name = (
                "seed_tensor" if cur_op.type == "fused_dropout_add" else "Seed"
            )
            if cur_op.input(seed_tensor_name) is not None and len(
                cur_op.input(seed_tensor_name)
            ):
                op_idx += 1
                continue
......@@ -179,7 +184,7 @@ class RecomputeState(ProgramStats):
            # modify dropout op's desc
            self.ops.insert(op_idx, seed_op)
            cur_op.desc.set_input("Seed", [var_unique_name])
            cur_op.desc.set_input(seed_tensor_name, [var_unique_name])
            cur_op._remove_attr("fix_seed")
            cur_op._remove_attr("seed")
            cur_op_dist_attr.set_input_dist_attr(
......
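The recompute change amounts to treating `fused_dropout_add` the same way as `dropout`, except that its seed input is named `seed_tensor` instead of `Seed`. A tiny sketch of that mapping:

```python
def seed_input_name(op_type):
    # Mirrors the conditional added to RecomputeState above.
    return "seed_tensor" if op_type == "fused_dropout_add" else "Seed"


assert seed_input_name("fused_dropout_add") == "seed_tensor"
assert seed_input_name("dropout") == "Seed"
```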
......@@ -79,6 +79,7 @@ def fused_dropout_add(
        out, seed_offset = _C_ops.fused_dropout_add(
            x,
            y,
            None,
            p,
            not training,
            mode,
......@@ -109,7 +110,7 @@ def fused_dropout_add(
    helper.append_op(
        type='fused_dropout_add',
        inputs={'x': x, 'y': y},
        inputs={'x': x, 'y': y, 'seed_tensor': None},
        outputs={'out': [out], 'seed_offset': [seed_offset]},
        attrs=attrs,
    )
......
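For reference, a dygraph usage sketch of the public wrapper patched above, assuming it is exposed as `paddle.incubate.nn.functional.fused_dropout_add` and a CUDA build is available; the new `seed_tensor` input is not user-facing and is filled in by auto-parallel random control:

```python
import paddle
from paddle.incubate.nn.functional import fused_dropout_add

paddle.seed(2023)
x = paddle.randn([2, 1024])
y = paddle.randn([2, 1024])
# Equivalent to dropout(x, p=0.1) + y, computed by the fused kernel.
out = fused_dropout_add(x, y, p=0.1, training=True)
```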