Unverified commit 4dbe441c, authored by cyberslack_lee, committed by GitHub

[xdoctest] reformat example code with google style in No. 250-260 (#56541)

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* fix

* test=docs_preview

* test=docs_preview

* fix

* move stmts under imports

---------
Co-authored-by: SigureMo <sigure.qaq@gmail.com>
Parent 7314cf69
@@ -22,7 +22,7 @@ from paddle.nn.clip import ClipGradBase, _squared_l2_norm
 class ClipGradForMOEByGlobalNorm(ClipGradBase):
     r"""
-    The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
+    The Algorithm is the same as paddle.nn.ClipGradByGlobalNorm
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .
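For reference, the global-norm clipping rule documented for ``paddle.nn.ClipGradByGlobalNorm``, which the description above says this class follows, can be written as (a restatement of that documented formula, with :math:`N` the number of tensors in :math:`t\_list`):

.. math::

    t\_list[i] \leftarrow t\_list[i] \times \frac{clip\_norm}{\max(global\_norm,\ clip\_norm)},
    \qquad
    global\_norm = \sqrt{\sum_{i=0}^{N-1} \big(l2norm(t\_list[i])\big)^{2}}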
@@ -50,7 +50,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
     Note:
         ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
-        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
     Reference:
         https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
@@ -64,22 +64,22 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.
     Examples:
         .. code-block:: python
-            import paddle
-            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(in_features=10, out_features=10,
-                                      weight_attr=paddle.ParamAttr(need_clip=True),
-                                      bias_attr=paddle.ParamAttr(need_clip=False))
-            out = linear(x)
-            loss = paddle.mean(out)
-            loss.backward()
-            is_expert_func = lambda param: "expert_" in param.name
-            clip = paddle.nn.ClipGradForMOEByGlobalNorm(clip_norm=1.0,is_expert_func, None)
-            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
-            sdg.step()
+            >>> import paddle
+            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
+            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
+            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
+            >>> out = linear(x)
+            >>> loss = paddle.mean(out)
+            >>> loss.backward()
+            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) # Cause paddle.nn hasn't this interface, so we use ClipGradByGlobalNorm here.
+            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            >>> sdg.step()
     """
     def __init__(
@@ -124,7 +124,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         else:
             sum_square_list.append(sum_square)
-    # all parameters have been filterd out
+    # all parameters have been filtered out
     if (
         len(sum_square_list)
         + len(sum_square_list_fp16)
......
@@ -263,63 +263,68 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
 class MoELayer(nn.Layer):
     """MoE Layer
     Args:
-        d_model: (int) model dimention
-        experts: (nn.LayerList) expert networks list
-        gate: (dict|NaiveGate|SwitchGate|NaiveGate):
-            if gate is a dict:
-                gate is a gate network config, containing 2 keys:
-                `type`(str) value can be: "naive", "gshard", "switch" or None, default is "gshard"
-                `top_k`(int) default value is 2
-            else gate is an instance of NaiveGate|SwitchGate|NaiveGate:
-        moe_group: moe group for experts communication
-        mp_group: mp group for mp commutication
-        recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute.
-        recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given.
+        d_model (int): Model dimention.
+        experts (nn.LayerList): Expert networks list.
+        gate (dict|NaiveGate|SwitchGate|NaiveGate):
+            - If gate is a dict:
+              gate is a gate network config, containing 2 keys:
+              `type` (str) value can be: "naive", "gshard", "switch" or None, default is "gshard".
+              `top_k` (int) Default value is 2.
+            else gate is an instance of NaiveGate|SwitchGate|NaiveGate:
+        moe_group: moe group for experts communication.
+        mp_group: mp group for mp communication.
+        recompute_interval (int, optional): Whether to use recompute, default 0, means to disable recompute.
+        recompute_ctx (dict, optional): The context for recompute, if recompute_interval > 1, recompute_ctx must be given.
     Examples:
         .. code-block:: python
-            from paddle.nn import layer, LayerList
-            from paddle.distributed.moe import MoElayer
-            from paddle.distributed.collective import Group
-            from paddle.distributed import fleet
-            moe_group = Group(fleet.worker_index(),
-                              0,
-                              list(range(fleet.worker_num())))
-            mp_group = None
-            num_experts=8
-            dim_feedforward=512
-            d_model=8
-            top_k=2
-            class ExpertLayer(Layer):
-                def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
-                    super().__init__()
-                    self.htoh4 = nn.Linear(d_model, d_hidden)
-                    self.h4toh = nn.Linear(d_hidden, d_model)
-                def forward(self, x):
-                    x = self.htoh4(x)
-                    x = self.h4toh(x)
-                    return x
-            gate_config = {
-                "type": "gshard",
-                "top_k": top_k,
-            }
-            experts_list = LayerList()
-            for expi in range(num_experts):
-                exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
-                experts_list.append(exp_layer)
-            moeLayer = MoELayer(d_model = d_model,
-                                experts=experts_list,
-                                gate=gate_config,
-                                moe_group=moe_group,
-                                mp_group=mp_group,
-                                recompute_interval=0)
+            >>> # doctest: +SKIP('Until Distributed move successfully, just skip it')
+            >>> from paddle.nn import layer, LayerList
+            >>> from paddle.distributed.moe import MoElayer
+            >>> from paddle.distributed.collective import Group
+            >>> from paddle.distributed import fleet
+            >>> moe_group = Group(fleet.worker_index(),
+            ...                   0,
+            ...                   list(range(fleet.worker_num())))
+            >>> mp_group = None
+            >>> num_experts=8
+            >>> dim_feedforward=512
+            >>> d_model=8
+            >>> top_k=2
+            >>> class ExpertLayer(Layer):
+            ...     def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
+            ...         super().__init__()
+            ...         self.htoh4 = nn.Linear(d_model, d_hidden)
+            ...         self.h4toh = nn.Linear(d_hidden, d_model)
+            ...     def forward(self, x):
+            ...         x = self.htoh4(x)
+            ...         x = self.h4toh(x)
+            ...         return x
+            >>> gate_config = {
+            ...     "type": "gshard",
+            ...     "top_k": top_k,
+            ... }
+            >>> experts_list = LayerList()
+            >>> for expi in range(num_experts):
+            ...     exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
+            ...     experts_list.append(exp_layer)
+            >>> moeLayer = MoELayer(d_model = d_model,
+            ...                     experts=experts_list,
+            ...                     gate=gate_config,
+            ...                     moe_group=moe_group,
+            ...                     mp_group=mp_group,
+            ...                     recompute_interval=0)
     """
......
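The example in the new docstring stops after constructing the layer. A minimal forward-pass sketch continuing that example is shown below; it assumes the layer accepts an input whose last dimension equals ``d_model`` and returns a tensor of the same shape (the input layout here is an assumption, not something stated in the docstring):

.. code-block:: python

    >>> # doctest: +SKIP('illustrative continuation of the docstring example above')
    >>> # Assumed input layout: [batch_size, seq_len, d_model]; the gate routes each
    >>> # token to its top_k experts, and the expert outputs keep the d_model width.
    >>> import paddle
    >>> x = paddle.randn([4, 16, d_model])
    >>> y = moeLayer(x)
    >>> print(y.shape)
    [4, 16, 8]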
This diff is collapsed.
@@ -51,15 +51,27 @@ def fused_dropout_add(
     Examples:
         .. code-block:: python
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_dropout_add
-            x = paddle.randn([4, 10], dtype='float16')
-            y = paddle.randn([4, 10], dtype='float16')
-            out = fused_dropout_add(x, y, p=0.5)
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_dropout_add
+            >>> paddle.set_device('gpu')
+            >>> paddle.seed(2023)
+            >>> x = paddle.randn([4, 10], dtype="float32")
+            >>> y = paddle.randn([4, 10], dtype="float32")
+            >>> out = fused_dropout_add(x, y, p=0.5)
+            >>> print(out)
+            Tensor(shape=[4, 10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            [[-0.49133155,  0.53819323, -2.58393312,  0.06336236, -1.09908366,
+               0.22085167,  2.19751787,  0.05034769,  0.53417486,  0.84864247],
+             [ 0.78248203, -1.59652555, -0.14399840, -0.77985179, -0.17006736,
+              -0.30991879, -0.36593807, -0.51025450,  1.46401680,  0.61627960],
+             [ 4.50472546, -0.48472026,  0.60729283,  0.33509624, -0.25593102,
+              -1.45173049,  1.06727099,  0.00440830, -0.77340341,  0.67393088],
+             [ 1.29453969,  0.07568165,  0.71947742, -0.71768606, -2.57172823,
+               1.89179027,  3.26482797,  1.10493207, -1.04569530, -1.04862499]])
     """
     if isinstance(p, (int, float)):
         # fast return for p == 0
......
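As context for the example above, the operation fused by ``fused_dropout_add`` corresponds to a dropout applied to ``x`` followed by an elementwise add of ``y``. A rough unfused sketch is given below; the printed values will not match the fused kernel exactly, since the random dropout masks are generated differently:

.. code-block:: python

    >>> # Unfused sketch of dropout(x) + y; shapes match the fused result,
    >>> # the random mask (and therefore the values) generally will not.
    >>> import paddle
    >>> import paddle.nn.functional as F
    >>> x = paddle.randn([4, 10], dtype="float32")
    >>> y = paddle.randn([4, 10], dtype="float32")
    >>> ref = F.dropout(x, p=0.5) + y
    >>> print(ref.shape)
    [4, 10]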
@@ -37,25 +37,20 @@ def fused_ec_moe(
     Examples:
         .. code-block:: python
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_ec_moe
-            batch = 10
-            seq_len = 128
-            d_model = 1024
-            d_feed_forward = d_model * 4
-            num_expert = 8
-            x = paddle.randn([batch, seq_len, d_model])
-            gate = paddle.randn([batch, seq_len, num_expert])
-            bmm0_weight = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm0_bias = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm1_weight = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm1_bias = paddle.randn([num_expert, d_model, d_feed_forward])
-            out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
-            print(out.shape) # [batch, seq_len, num_expert]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_ec_moe
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([10, 128, 1024])
+            >>> gate = paddle.randn([10, 128, 8])
+            >>> bmm0_weight = paddle.randn([8, 1024, 4096])
+            >>> bmm0_bias = paddle.randn([8, 1024, 4096])
+            >>> bmm1_weight = paddle.randn([8, 1024, 4096])
+            >>> bmm1_bias = paddle.randn([8, 1024, 4096])
+            >>> out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
+            >>> print(out.shape)
+            [10, 128, 1024]
     """
     helper = LayerHelper('fused_moe', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
@@ -39,7 +39,7 @@ def fused_gate_attention(
     to information from different representation subspaces. This API only
     support self_attention. The pseudo code is as follows:
-    .. code-block:: python
+    .. code-block:: text
         c = c ** (-0.5)
        q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c
@@ -64,20 +64,20 @@ def fused_gate_attention(
     Args:
         query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim].
-        key (Tensor, optional): The input key tensor, which can be set when
-            merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim].
-        query_weight (Tensor, optional): The weight of query linear, which
-            should be set when input key is not None. The shape is [q_dim, num_heads, head_dim].
-        key_weight (Tensor, optional): The weight of key linear, which should
-            be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
-        value_weight (Tensor, optional): The weight of value linear, which should
-            be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
-        qkv_weight (Tensor, optional): The weight of qkv linear, which should
-            be set when merge_qkv is True. The shape is [3, num_heads, head_dim, q_dim].
-        gate_linear_weight (Tensor, optional): The weight of gating linear,
-            which should be set when has_gating is True. The shape is [q_dim, num_heads, head_dim].
-        gate_linear_bias (Tensor, optional): The bias of gating linear, which
-            should be set when has_gating is True. The shape is [num_heads, head_dim]. Default None.
-        out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim].
+        key (Tensor, optional): The input key tensor, which can be set when
+            merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. Default None.
+        query_weight (Tensor, optional): The weight of query linear, which should be set when input
+            key is not None. The shape is [q_dim, num_heads, head_dim]. Default None.
+        key_weight (Tensor, optional): The weight of key linear, which should be set when input key
+            is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
+        value_weight (Tensor, optional): The weight of value linear, which should be set when input
+            key is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
+        qkv_weight (Tensor, optional): The weight of qkv linear, which should be set when merge_qkv
+            is True. The shape is [3, num_heads, head_dim, q_dim]. Default None.
+        gate_linear_weight (Tensor, optional): The weight of gating linear, which should be set when
+            has_gating is True. The shape is [q_dim, num_heads, head_dim]. Default None.
+        gate_linear_bias (Tensor, optional): The bias of gating linear, which should be set when
+            has_gating is True. The shape is [num_heads, head_dim]. Default None.
+        out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. Default None.
         out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None.
         nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None.
         attn_mask (Tensor, optional): The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None.
@@ -92,54 +92,54 @@ def fused_gate_attention(
         .. code-block:: python
-            # required: gpu
-            import paddle
-            import paddle.incubate.nn.functional as F
-            # batch_size = 2
-            # msa_len = 4
-            # res_len = 2
-            # q_dim = 4
-            # num_heads = 8
-            # head_dim = 4
-            # m_size = res_len (when merge_qkv is True)
-            # query: [batch_size, msa_len, res_len, q_dim]
-            query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
-            # qkv_weight: [3, n_heads, head_dim, q_dim]
-            qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
-            # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
-            nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
-            # attn_mask: [batch_size, msa_len, 1, 1, m_size]
-            attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
-            # gate_linear_weight: [q_dim, num_heads, head_dim]
-            gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
-            # gate_bias: [num_heads, head_dim]
-            gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
-            # out_linear_weight: [num_heads, head_dim, q_dim]
-            out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
-            # out_linear_bias: [q_dim]
-            out_linear_bias = paddle.rand(shape=[4], dtype="float32")
-            # output: [batch_size, msa_len, res_len, q_dim]
-            output = F.fused_gate_attention(
-                query=query,
-                qkv_weight=qkv_weight,
-                gate_linear_weight=gate_linear_weight,
-                gate_linear_bias=gate_linear_bias,
-                out_linear_weight=out_linear_weight,
-                out_linear_bias=out_linear_bias,
-                nonbatched_bias=nonbatched_bias,
-                attn_mask=attn_mask,
-                has_gating=True,
-                merge_qkv=True)
-            print(output.shape)
-            # [2, 4, 2, 4]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> import paddle.incubate.nn.functional as F
+            >>> # batch_size = 2
+            >>> # msa_len = 4
+            >>> # res_len = 2
+            >>> # q_dim = 4
+            >>> # num_heads = 8
+            >>> # head_dim = 4
+            >>> # m_size = res_len (when merge_qkv is True)
+            >>> # query: [batch_size, msa_len, res_len, q_dim]
+            >>> query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
+            >>> # qkv_weight: [3, n_heads, head_dim, q_dim]
+            >>> qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
+            >>> # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
+            >>> nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
+            >>> # attn_mask: [batch_size, msa_len, 1, 1, m_size]
+            >>> attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
+            >>> # gate_linear_weight: [q_dim, num_heads, head_dim]
+            >>> gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
+            >>> # gate_bias: [num_heads, head_dim]
+            >>> gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
+            >>> # out_linear_weight: [num_heads, head_dim, q_dim]
+            >>> out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
+            >>> # out_linear_bias: [q_dim]
+            >>> out_linear_bias = paddle.rand(shape=[4], dtype="float32")
+            >>> # output: [batch_size, msa_len, res_len, q_dim]
+            >>> output = F.fused_gate_attention(
+            ...     query=query,
+            ...     qkv_weight=qkv_weight,
+            ...     gate_linear_weight=gate_linear_weight,
+            ...     gate_linear_bias=gate_linear_bias,
+            ...     out_linear_weight=out_linear_weight,
+            ...     out_linear_bias=out_linear_bias,
+            ...     nonbatched_bias=nonbatched_bias,
+            ...     attn_mask=attn_mask,
+            ...     has_gating=True,
+            ...     merge_qkv=True)
+            >>> print(output.shape)
+            [2, 4, 2, 4]
     """
     if in_dynamic_mode():
......
@@ -28,11 +28,11 @@ def fused_matmul_bias(
     Args:
         x (Tensor): the first input Tensor to be multiplied.
         y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
-        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
-            be performed. Otherwise, the bias is added to the matrix multiplication result.
-        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
-        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        name(str|None): For detailed information, please refer to
+        bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
+        transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
+        transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
+        name (str, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
     Returns:
@@ -41,15 +41,18 @@ def fused_matmul_bias(
     Examples:
         .. code-block:: python
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_matmul_bias
-            x = paddle.randn([3, 4])
-            y = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_matmul_bias(x, y, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_matmul_bias
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> y = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_matmul_bias(x, y, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     if bias is None:
         return matmul(x, y, transpose_x, transpose_y, name)
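Since the docstring states that the bias is simply added to the matrix multiplication result, an unfused reference for the example above is a plain ``matmul`` plus a broadcasted add. A small sketch for comparison (not the fused kernel itself):

.. code-block:: python

    >>> # Unfused reference for fused_matmul_bias(x, y, bias):
    >>> # matmul followed by a broadcasted bias add.
    >>> import paddle
    >>> x = paddle.randn([3, 4])
    >>> y = paddle.randn([4, 5])
    >>> bias = paddle.randn([5])
    >>> ref = paddle.matmul(x, y) + bias
    >>> print(ref.shape)
    [3, 5]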
@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
     Args:
         x (Tensor): the input Tensor to be multiplied.
         weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
-        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
-            be performed. Otherwise, the bias is added to the matrix multiplication result.
-        transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
-        name(str|None): For detailed information, please refer to
+        bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
+        transpose_weight (bool, optional): Whether to transpose :math:`weight` before multiplication. Default: False.
+        name (str, optional): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
     Returns:
@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
     Examples:
         .. code-block:: python
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_linear
-            x = paddle.randn([3, 4])
-            weight = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_linear(x, weight, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_linear
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> weight = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_linear(x, weight, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
@@ -109,25 +115,32 @@ def fused_linear_activation(
     Args:
         x (Tensor): the input Tensor to be multiplied.
-        weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
+        y (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
         bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result.
-        transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
-        activation(str|None): Activation function, Currently, the available activation functions are limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit). These activation functions are applied to the output of the bias add.
+        trans_x (bool, optional): Whether to transpose :math:`x` before multiplication.
+        trans_y (bool, optional): Whether to transpose :math:`y` before multiplication.
+        activation (str, optional): Activation function, Currently, the available activation functions are
+            limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit).
+            These activation functions are applied to the output of the bias add. Default: None.
     Returns:
         Tensor: the output Tensor.
     Examples:
         .. code-block:: python
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_linear_activation
-            x = paddle.randn([3, 4])
-            weight = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_linear_activation(x, weight, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_linear_activation
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> weight = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_linear_activation(x, weight, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     if activation is None:
         activation = "none"
......
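Following the description above (the activation is applied to the output of the bias add), an unfused sketch of the same computation with ``activation="relu"`` is shown below; it only illustrates the documented semantics, not the fused kernel:

.. code-block:: python

    >>> # Unfused sketch of fused_linear_activation(x, weight, bias, activation="relu"):
    >>> # matmul, then bias add, then the chosen activation.
    >>> import paddle
    >>> import paddle.nn.functional as F
    >>> x = paddle.randn([3, 4])
    >>> weight = paddle.randn([4, 5])
    >>> bias = paddle.randn([5])
    >>> ref = F.relu(paddle.matmul(x, weight) + bias)
    >>> print(ref.shape)
    [3, 5]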
@@ -44,14 +44,13 @@ def fused_rotary_position_embedding(
     Examples:
         .. code-block:: python
-            >>> # required: gpu
             >>> # doctest: +REQUIRES(env:GPU)
             >>> import paddle
             >>> from paddle.incubate.nn.functional import fused_rotary_position_embedding
-            >>> paddle.device.set_device('gpu')
+            >>> paddle.set_device('gpu')
             >>> # batch_size = 2
             >>> # seq_len = 2
......
This diff is collapsed.
This diff is collapsed.