Unverified · commit aac7879a authored by Roc, committed by GitHub

[MoE] MoE APIs (#41092)

* add random routing op

add _random_routing api in utils

add random routing ut

* This is a combination of 10 commits:

  1. add expert count op; add ut for expert_count
  2. update UT only for CUDA
  3. fix for ROCm
  4. update ut
  5. add moe module
  6. add expert count op; add ut for expert_count
  7. update UT only for CUDA
  8. update ut
  9. add moe module
  10. make expert count private

* add assign pos op

* fix upper num name

* add api _assign_pos

* add ut for assign pos op

* update copyright date

* add ops for the MoE gate

update utils

add limit_by_capacity op

add ut for limit_by_capacity

add ut for prune_gate_by_capacity

* fix for Windows

* fix bugs in test_limit_by_capacity_op

* update ut

* update tests (timeout)

* fix ut

* update

* update (fix) ut for Windows

* add MoE APIs in incubate

* fix ut for number_count

* add apis and utils

* add gate apis

* add moe and grad clip apis

* update moe apis

* add ops for moe gate

* fix

* update base MoE layer API

* fix for dygraph

* update with random routing

* update

* fix ut for limit_by_capacity

* update

* update limit_by_capacity so it can easily switch to single-thread mode

* update api docs
Co-authored-by: hlygit66666 <2570058140@qq.com>
Parent: 8f7c02f2
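Taken together, the new ops form a token-dispatch pipeline: the expert-count op tallies how many tokens the gate routes to each expert, and assign_pos derives each token's slot in the expert-grouped buffer. A rough single-card sketch of those two steps in plain Paddle follows; the calls below are illustrative equivalents, not the signatures of the new ops, which this page does not show:

```python
import paddle

num_expert = 4
# expert chosen for each of 6 tokens by the gate
gate_idx = paddle.to_tensor([2, 0, 2, 1, 0, 2], dtype="int64")

# roughly what the expert-count op computes:
# tokens routed to each expert -> [2, 1, 3, 0]
expert_cnt = paddle.bincount(gate_idx, minlength=num_expert)

# roughly what assign_pos computes: an index that groups tokens by
# expert, so each expert's tokens occupy one contiguous slice after
# scattering -> [1, 4, 3, 0, 2, 5]
pos = paddle.argsort(gate_idx)
```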
@@ -20,19 +20,17 @@
 namespace paddle {
 namespace operators {

-#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1)
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;

 template <typename T>
 __global__ void limit_by_capacity_impl(const T* expc, T* cap, T* out,
                                        const int n_expert, const int n_worker) {
-  int eid = blockIdx.y;
-  int wid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (wid < n_worker) {
+  int eid, wid;
+  CUDA_KERNEL_LOOP(i, (n_expert * n_worker)) {
+    wid = i / n_expert;
+    eid = i % n_expert;
     auto proposal = expc[wid * n_expert + eid];
-    // int cap_left = atomicSub(cap + eid, proposal);
     auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
     if (cap_left >= proposal) {
       out[wid * n_expert + eid] = proposal;
@@ -54,12 +52,11 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T>
     auto out = context.Output<Tensor>("Out");
     auto n_expert = expert_count->numel() / n_worker;
-    // std::cout << "n_expert" << n_expert << std::endl;
     const auto place = context.GetPlace();
     const auto& dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
-    dim3 grid_dim(CEIL(n_worker, 1024), n_expert);
+    dim3 grid_dim(256);
     dim3 block_dim(1024);
     auto out_data = out->mutable_data<T>(place);
     const T* ec_data = expert_count->data<T>();
......
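In the hunks above, CUDA_KERNEL_LOOP is Paddle's grid-stride-loop macro: the launch geometry no longer depends on n_worker and n_expert, each thread strides over all (worker, expert) pairs, and launching with one block and one thread still covers every pair, which is what "switch to single-thread mode" in the commit message refers to. Below is a sequential NumPy sketch of the semantics the kernel implements; the else branch is truncated from this diff, so granting the clipped remainder is an assumption:

```python
import numpy as np

def limit_by_capacity_ref(expert_count, capacity, n_worker):
    # expert_count: (n_worker * n_expert,) tokens each worker proposes per expert
    # capacity:     (n_expert,) remaining token budget per expert
    n_expert = expert_count.size // n_worker
    expc = expert_count.reshape(n_worker, n_expert)
    cap = capacity.astype(expc.dtype).copy()
    out = np.empty_like(expc)
    for wid in range(n_worker):
        for eid in range(n_expert):
            proposal = expc[wid, eid]
            cap_left = cap[eid]           # value before the atomic subtract
            cap[eid] -= proposal
            if cap_left >= proposal:
                out[wid, eid] = proposal  # enough budget: grant in full
            else:
                # assumed truncation branch (not visible in this diff)
                out[wid, eid] = max(cap_left, 0)
    return out.reshape(-1)
```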
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
......
(The same copyright-year bump is applied to four more files; the identical hunks are omitted here.)
@@ -30,8 +30,6 @@ from .utils import count_by_gate
 from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
 from paddle import fluid

-__all__ = ["MoeLayer"]
-
 def _local_scatter(inp, pos):
     if pos.shape != [0]:
@@ -71,7 +69,7 @@
         'ring_id', ring_id, 'nranks', nranks)

-class MOEScatter(PyLayer):
+class MoEScatter(PyLayer):
     r"""
     Scatter input samples from [batch x sequences] to contiguous alone experts.
     If `world_size` is greater than 1, the samples will first be locally
@@ -117,10 +115,10 @@ class MoEScatter(PyLayer):
         return grad_in, None, None, None

-class MOEGather(PyLayer):
+class MoEGather(PyLayer):
     r"""
     Gather output samples from contiguous alone experts back to [batch x
-    sequences]. Works symmetrically with MOEScatter.
+    sequences]. Works symmetrically with MoEScatter.
     """

     @staticmethod
@@ -225,8 +223,8 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
         fwd_batch_size, )

-class MoeLayer(nn.Layer):
-    """Moe Layer
+class MoELayer(nn.Layer):
+    """MoE Layer
     Args:
         d_model: (int) model dimention
         experts: (nn.LayerList) expert networks list
@@ -243,7 +241,7 @@ class MoELayer(nn.Layer):
     Examples:
         .. code-block:: python
             from paddle.nn import layer, LayerList
-            from paddle.distributed.moe import Moelayer
+            from paddle.distributed.moe import MoElayer
             from paddle.distributed.collective import Group
             from paddle.distributed import fleet
@@ -279,7 +277,7 @@ class MoELayer(nn.Layer):
                 exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
                 experts_list.append(exp_layer)

-            moeLayer = MoeLayer(d_model = d_model,
+            moeLayer = MoELayer(d_model = d_model,
                                 experts=experts_list,
                                 gate=gate_config,
                                 moe_group=moe_group,
@@ -295,7 +293,7 @@ class MoELayer(nn.Layer):
                  moe_group=None,
                  mp_group=None,
                  **kwargs):
-        super(MoeLayer, self).__init__()
+        super(MoELayer, self).__init__()
         recompute_interval = kwargs.get("recompute_interval", 0)
@@ -385,7 +383,7 @@ class MoELayer(nn.Layer):
             temp_pos = pos
         assert topk == self.top_k

-        x = MOEScatter.apply(inp, temp_pos, local_expert_count,
+        x = MoEScatter.apply(inp, temp_pos, local_expert_count,
                              global_expert_count, fwd_batch_size,
                              self.world_size, self.group)
@@ -416,7 +414,7 @@ class MoELayer(nn.Layer):
         if len(gate.shape) == 2:
             out_batch_size *= gate.shape[1]

-        x = MOEGather.apply(x, pos, local_expert_count, global_expert_count,
+        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
                             out_batch_size, self.world_size, self.group)
         x = x.reshape([-1, self.top_k, d_model])
......
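MoEScatter and MoEGather, renamed above, are custom-autograd ops built on paddle.autograd.PyLayer: forward permutes tokens into expert-contiguous order (plus an all-to-all exchange across ranks when world_size > 1), and backward pushes gradients through the inverse permutation, which is why it returns grad_in, None, None, None for the non-differentiable inputs. A minimal single-process sketch of that pattern, with the all-to-all omitted (LocalScatter is a hypothetical name, not part of this PR):

```python
import paddle
from paddle.autograd import PyLayer

class LocalScatter(PyLayer):
    @staticmethod
    def forward(ctx, inp, pos):
        ctx.save_for_backward(pos)
        ctx.inp_shape = inp.shape
        # reorder rows so each expert's tokens sit in one contiguous slice
        return paddle.gather(inp, pos)

    @staticmethod
    def backward(ctx, grad):
        pos, = ctx.saved_tensor()
        # route the gradient rows back to their original token positions;
        # None for the non-differentiable index input, mirroring MoEScatter
        buf = paddle.zeros(ctx.inp_shape, dtype=grad.dtype)
        return paddle.scatter(buf, pos, grad), None
```

With world_size == 1 the scatter direction reduces to exactly this local permutation: `LocalScatter.apply(x, pos)` behaves like paddle.gather in forward while remaining differentiable back to the original token order, and the gather direction is its transpose.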