Unverified commit 4dbe441c authored by cyberslack_lee and committed by GitHub

[xdoctest] reformat example code with google style in No. 250-260 (#56541)

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* fix

* test=docs_preview

* test=docs_preview

* fix

* move stmts under imports

---------
Co-authored-by: SigureMo <sigure.qaq@gmail.com>
Parent 7314cf69
......@@ -22,7 +22,7 @@ from paddle.nn.clip import ClipGradBase, _squared_l2_norm
class ClipGradForMOEByGlobalNorm(ClipGradBase):
r"""
The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
The Algorithm is the same as paddle.nn.ClipGradByGlobalNorm
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
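For reference, the clipping rule described above follows the usual global-norm formula: the squared L2 norms of all tensors are summed, the square root gives the global norm, and every tensor is scaled by clip_norm / max(global_norm, clip_norm). Below is a minimal illustrative sketch of that rule only; it is not the MoE-aware implementation in this class, which additionally separates expert parameters and handles communication groups.
>>> import paddle
>>> def clip_by_global_norm(t_list, clip_norm):
...     # illustrative only; the real class also distinguishes expert / non-expert params
...     global_norm = float(paddle.sqrt(sum((t * t).sum() for t in t_list)))
...     # shrink all tensors by the same factor only when the norm exceeds clip_norm
...     scale = clip_norm / max(global_norm, clip_norm)
...     return [t * scale for t in t_list]
>>> grads = [paddle.ones([2, 2]) * 3.0, paddle.ones([3]) * 4.0]
>>> clipped = clip_by_global_norm(grads, clip_norm=1.0)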
......@@ -50,7 +50,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Reference:
https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
......@@ -64,22 +64,22 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.
Examples:
.. code-block:: python
import paddle
>>> import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
loss.backward()
>>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
>>> linear = paddle.nn.Linear(in_features=10, out_features=10,
... weight_attr=paddle.ParamAttr(need_clip=True),
... bias_attr=paddle.ParamAttr(need_clip=False))
>>> out = linear(x)
>>> loss = paddle.mean(out)
>>> loss.backward()
is_expert_func = lambda param: "expert_" in param.name
clip = paddle.nn.ClipGradForMOEByGlobalNorm(clip_norm=1.0,is_expert_func, None)
sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
sdg.step()
>>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)  # Because paddle.nn does not provide ClipGradForMOEByGlobalNorm, ClipGradByGlobalNorm is used here.
>>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
>>> sdg.step()
"""
def __init__(
......@@ -124,7 +124,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
else:
sum_square_list.append(sum_square)
# all parameters have been filterd out
# all parameters have been filtered out
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
......
......@@ -263,63 +263,68 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
class MoELayer(nn.Layer):
"""MoE Layer
Args:
d_model: (int) model dimention
experts: (nn.LayerList) expert networks list
gate: (dict|NaiveGate|SwitchGate|NaiveGate):
if gate is a dict:
gate is a gate network config, containing 2 keys:
`type`(str) value can be: "naive", "gshard", "switch" or None, default is "gshard"
`top_k`(int) default value is 2
else gate is an instance of NaiveGate|SwitchGate|NaiveGate:
moe_group: moe group for experts communication
mp_group: mp group for mp commutication
recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute.
recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given.
d_model (int): Model dimension.
experts (nn.LayerList): Expert networks list.
gate (dict|NaiveGate|SwitchGate|NaiveGate):
- If gate is a dict:
gate is a gate network config, containing 2 keys:
`type` (str) value can be: "naive", "gshard", "switch" or None, default is "gshard".
`top_k` (int) Default value is 2.
- Else, gate is an instance of NaiveGate|SwitchGate|NaiveGate.
moe_group: moe group for experts communication.
mp_group: mp group for mp communication.
recompute_interval (int, optional): Whether to use recompute. Default is 0, which disables recompute.
recompute_ctx (dict, optional): The context for recompute. If recompute_interval > 1, recompute_ctx must be given.
Examples:
.. code-block:: python
from paddle.nn import layer, LayerList
from paddle.distributed.moe import MoElayer
from paddle.distributed.collective import Group
from paddle.distributed import fleet
moe_group = Group(fleet.worker_index(),
0,
list(range(fleet.worker_num())))
mp_group = None
num_experts=8
dim_feedforward=512
d_model=8
top_k=2
class ExpertLayer(Layer):
def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
super().__init__()
self.htoh4 = nn.Linear(d_model, d_hidden)
self.h4toh = nn.Linear(d_hidden, d_model)
def forward(self, x):
x = self.htoh4(x)
x = self.h4toh(x)
return x
gate_config = {
"type": "gshard",
"top_k": top_k,
}
experts_list = LayerList()
for expi in range(num_experts):
exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
experts_list.append(exp_layer)
moeLayer = MoELayer(d_model = d_model,
experts=experts_list,
gate=gate_config,
moe_group=moe_group,
mp_group=mp_group,
recompute_interval=0)
>>> # doctest: +SKIP('Until Distributed move successfully, just skip it')
>>> import paddle.nn as nn
>>> from paddle.nn import Layer, LayerList
>>> from paddle.distributed.moe import MoELayer
>>> from paddle.distributed.collective import Group
>>> from paddle.distributed import fleet
>>> moe_group = Group(fleet.worker_index(),
... 0,
... list(range(fleet.worker_num())))
>>> mp_group = None
>>> num_experts=8
>>> dim_feedforward=512
>>> d_model=8
>>> top_k=2
>>> class ExpertLayer(Layer):
... def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
... super().__init__()
... self.htoh4 = nn.Linear(d_model, d_hidden)
... self.h4toh = nn.Linear(d_hidden, d_model)
... def forward(self, x):
... x = self.htoh4(x)
... x = self.h4toh(x)
... return x
>>> gate_config = {
... "type": "gshard",
... "top_k": top_k,
... }
>>> experts_list = LayerList()
>>> for expi in range(num_experts):
... exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
... experts_list.append(exp_layer)
>>> moeLayer = MoELayer(d_model = d_model,
... experts=experts_list,
... gate=gate_config,
... moe_group=moe_group,
... mp_group=mp_group,
... recompute_interval=0)
"""
......
This diff has been collapsed.
......@@ -51,15 +51,27 @@ def fused_dropout_add(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_dropout_add
x = paddle.randn([4, 10], dtype='float16')
y = paddle.randn([4, 10], dtype='float16')
out = fused_dropout_add(x, y, p=0.5)
.. code-block:: python
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_dropout_add
>>> paddle.set_device('gpu')
>>> paddle.seed(2023)
>>> x = paddle.randn([4, 10], dtype="float32")
>>> y = paddle.randn([4, 10], dtype="float32")
>>> out = fused_dropout_add(x, y, p=0.5)
>>> print(out)
Tensor(shape=[4, 10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
[[-0.49133155, 0.53819323, -2.58393312, 0.06336236, -1.09908366,
0.22085167, 2.19751787, 0.05034769, 0.53417486, 0.84864247],
[ 0.78248203, -1.59652555, -0.14399840, -0.77985179, -0.17006736,
-0.30991879, -0.36593807, -0.51025450, 1.46401680, 0.61627960],
[ 4.50472546, -0.48472026, 0.60729283, 0.33509624, -0.25593102,
-1.45173049, 1.06727099, 0.00440830, -0.77340341, 0.67393088],
[ 1.29453969, 0.07568165, 0.71947742, -0.71768606, -2.57172823,
1.89179027, 3.26482797, 1.10493207, -1.04569530, -1.04862499]])
"""
if isinstance(p, (int, float)):
# fast return for p == 0
......
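For intuition, the fused example above is expected to match an unfused reference along the lines below. This is only a sketch: it assumes dropout is applied to x before the residual add of y, and the result will not be bitwise identical because the fused kernel draws its own random mask.
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([4, 10])
>>> y = paddle.randn([4, 10])
>>> # assumption: dropout on x first, then elementwise add of y
>>> ref = F.dropout(x, p=0.5) + y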
......@@ -37,25 +37,20 @@ def fused_ec_moe(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_ec_moe
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_ec_moe
batch = 10
seq_len = 128
d_model = 1024
d_feed_forward = d_model * 4
num_expert = 8
x = paddle.randn([batch, seq_len, d_model])
gate = paddle.randn([batch, seq_len, num_expert])
bmm0_weight = paddle.randn([num_expert, d_model, d_feed_forward])
bmm0_bias = paddle.randn([num_expert, d_model, d_feed_forward])
bmm1_weight = paddle.randn([num_expert, d_model, d_feed_forward])
bmm1_bias = paddle.randn([num_expert, d_model, d_feed_forward])
out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
print(out.shape) # [batch, seq_len, num_expert]
>>> paddle.set_device('gpu')
>>> x = paddle.randn([10, 128, 1024])
>>> gate = paddle.randn([10, 128, 8])
>>> bmm0_weight = paddle.randn([8, 1024, 4096])
>>> bmm0_bias = paddle.randn([8, 1024, 4096])
>>> bmm1_weight = paddle.randn([8, 1024, 4096])
>>> bmm1_bias = paddle.randn([8, 1024, 4096])
>>> out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
>>> print(out.shape)
[10, 128, 1024]
"""
helper = LayerHelper('fused_moe', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
......@@ -39,7 +39,7 @@ def fused_gate_attention(
to information from different representation subspaces. This API only
supports self_attention. The pseudo code is as follows:
.. code-block:: python
.. code-block:: text
c = c ** (-0.5)
q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c
......@@ -64,20 +64,20 @@ def fused_gate_attention(
Args:
query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim].
key (Tensor, optional): The input key tensor, which can be set when
merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim].
query_weight (Tensor, optional): The weight of query linear, which
should be set when input key is not None. The shape is [q_dim, num_heads, head_dim].
key_weight (Tensor, optional): The weight of key linear, which should
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
value_weight (Tensor, optional): The weight of value linear, which should
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
qkv_weight (Tensor, optional): The weight of qkv linear, which should
be set when merge_qkv is True. The shape is [3, num_heads, head_dim, q_dim].
gate_linear_weight (Tensor, optional): The weight of gating linear,
which should be set when has_gating is True. The shape is [q_dim, num_heads, head_dim].
gate_linear_bias (Tensor, optional): The bias of gating linear, which
should be set when has_gating is True. The shape is [num_heads, head_dim]. Default None.
out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim].
merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. Default None.
query_weight (Tensor, optional): The weight of query linear, which should be set when input
key is not None. The shape is [q_dim, num_heads, head_dim]. Default None.
key_weight (Tensor, optional): The weight of key linear, which should be set when input key
is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
value_weight (Tensor, optional): The weight of value linear, which should be set when input
key is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
qkv_weight (Tensor, optional): The weight of qkv linear, which should be set when merge_qkv
is True. The shape is [3, num_heads, head_dim, q_dim]. Default None.
gate_linear_weight (Tensor, optional): The weight of gating linear, which should be set when
has_gating is True. The shape is [q_dim, num_heads, head_dim]. Default None.
gate_linear_bias (Tensor, optional): The bias of gating linear, which should be set when
has_gating is True. The shape is [num_heads, head_dim]. Default None.
out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. Default None.
out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None.
nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None.
attn_mask (Tensor, optional): The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None.
......@@ -92,54 +92,54 @@ def fused_gate_attention(
.. code-block:: python
# required: gpu
import paddle
import paddle.incubate.nn.functional as F
# batch_size = 2
# msa_len = 4
# res_len = 2
# q_dim = 4
# num_heads = 8
# head_dim = 4
# m_size = res_len (when merge_qkv is True)
# query: [batch_size, msa_len, res_len, q_dim]
query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
# qkv_weight: [3, n_heads, head_dim, q_dim]
qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
# nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
# attn_mask: [batch_size, msa_len, 1, 1, m_size]
attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
# gate_linear_weight: [q_dim, num_heads, head_dim]
gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
# gate_bias: [num_heads, head_dim]
gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
# out_linear_weight: [num_heads, head_dim, q_dim]
out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
# out_linear_bias: [q_dim]
out_linear_bias = paddle.rand(shape=[4], dtype="float32")
# output: [batch_size, msa_len, res_len, q_dim]
output = F.fused_gate_attention(
query=query,
qkv_weight=qkv_weight,
gate_linear_weight=gate_linear_weight,
gate_linear_bias=gate_linear_bias,
out_linear_weight=out_linear_weight,
out_linear_bias=out_linear_bias,
nonbatched_bias=nonbatched_bias,
attn_mask=attn_mask,
has_gating=True,
merge_qkv=True)
print(output.shape)
# [2, 4, 2, 4]
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> import paddle.incubate.nn.functional as F
>>> # batch_size = 2
>>> # msa_len = 4
>>> # res_len = 2
>>> # q_dim = 4
>>> # num_heads = 8
>>> # head_dim = 4
>>> # m_size = res_len (when merge_qkv is True)
>>> # query: [batch_size, msa_len, res_len, q_dim]
>>> query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
>>> # qkv_weight: [3, n_heads, head_dim, q_dim]
>>> qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
>>> # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
>>> nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
>>> # attn_mask: [batch_size, msa_len, 1, 1, m_size]
>>> attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
>>> # gate_linear_weight: [q_dim, num_heads, head_dim]
>>> gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
>>> # gate_bias: [num_heads, head_dim]
>>> gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
>>> # out_linear_weight: [num_heads, head_dim, q_dim]
>>> out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
>>> # out_linear_bias: [q_dim]
>>> out_linear_bias = paddle.rand(shape=[4], dtype="float32")
>>> # output: [batch_size, msa_len, res_len, q_dim]
>>> output = F.fused_gate_attention(
... query=query,
... qkv_weight=qkv_weight,
... gate_linear_weight=gate_linear_weight,
... gate_linear_bias=gate_linear_bias,
... out_linear_weight=out_linear_weight,
... out_linear_bias=out_linear_bias,
... nonbatched_bias=nonbatched_bias,
... attn_mask=attn_mask,
... has_gating=True,
... merge_qkv=True)
>>> print(output.shape)
[2, 4, 2, 4]
"""
if in_dynamic_mode():
......
......@@ -28,11 +28,11 @@ def fused_matmul_bias(
Args:
x (Tensor): the first input Tensor to be multiplied.
y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result.
transpose_x (bool): Whether to transpose :math:`x` before multiplication.
transpose_y (bool): Whether to transpose :math:`y` before multiplication.
name(str|None): For detailed information, please refer to
bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually, name does not need to be set and is None by default.
Returns:
......@@ -41,15 +41,18 @@ def fused_matmul_bias(
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_matmul_bias
x = paddle.randn([3, 4])
y = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_matmul_bias(x, y, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_matmul_bias
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> y = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_matmul_bias(x, y, bias)
>>> print(out.shape)
[3, 5]
"""
if bias is None:
return matmul(x, y, transpose_x, transpose_y, name)
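As the argument descriptions above state, when bias is given the op is the matrix multiplication of x and y followed by a bias add. An unfused reference looks like this (a sketch that ignores the fused GEMM epilogue and its performance benefit):
>>> import paddle
>>> x = paddle.randn([3, 4])
>>> y = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference: matmul, then broadcasted bias add; shape [3, 5]
>>> ref = paddle.matmul(x, y, transpose_x=False, transpose_y=False) + bias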
......@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Args:
x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
name(str|None): For detailed information, please refer to
bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_weight (bool, optional): Whether to transpose :math:`weight` before multiplication. Default: False.
name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually, name does not need to be set and is None by default.
Returns:
......@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_linear
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_linear(x, weight, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_linear(x, weight, bias)
>>> print(out.shape)
[3, 5]
"""
return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
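Since fused_linear delegates to fused_matmul_bias as shown above, with transpose_weight left False it computes the same result as the standard linear layer. A plain-Paddle reference (sketch):
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference: x @ weight + bias, shape [3, 5]
>>> ref = F.linear(x, weight, bias)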
......@@ -109,25 +115,32 @@ def fused_linear_activation(
Args:
x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
y (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
activation(str|None): Activation function, Currently, the available activation functions are limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit). These activation functions are applied to the output of the bias add.
trans_x (bool, optional): Whether to transpose :math:`x` before multiplication.
trans_y (bool, optional): Whether to transpose :math:`y` before multiplication.
activation (str, optional): Activation function. Currently, the available activation functions are
limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit).
These activation functions are applied to the output of the bias add. Default: None.
Returns:
Tensor: the output Tensor.
Examples:
.. code-block:: python
# required: gpu
import paddle
from paddle.incubate.nn.functional import fused_linear_activation
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5])
bias = paddle.randn([5])
out = fused_linear_activation(x, weight, bias)
print(out.shape) # [3, 5]
>>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear_activation
>>> paddle.set_device('gpu')
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> out = fused_linear_activation(x, weight, bias)
>>> print(out.shape)
[3, 5]
"""
if activation is None:
activation = "none"
......
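Per the argument descriptions above, the activation is applied to the output of the bias add. A plain-Paddle reference (sketch) of the same computation:
>>> import paddle
>>> import paddle.nn.functional as F
>>> x = paddle.randn([3, 4])
>>> weight = paddle.randn([4, 5])
>>> bias = paddle.randn([5])
>>> # unfused reference; use F.gelu for the "gelu" option
>>> ref = F.relu(paddle.matmul(x, weight) + bias)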
......@@ -44,14 +44,13 @@ def fused_rotary_position_embedding(
Examples:
.. code-block:: python
.. code-block:: python
>>> # required: gpu
>>> # doctest: +REQUIRES(env:GPU)
>>> import paddle
>>> from paddle.incubate.nn.functional import fused_rotary_position_embedding
>>> paddle.device.set_device('gpu')
>>> paddle.set_device('gpu')
>>> # batch_size = 2
>>> # seq_len = 2
......
This diff has been collapsed.
This diff has been collapsed.