From aac7879a605c758b937616e3940902191b469d4b Mon Sep 17 00:00:00 2001
From: Roc <30228238+sljlp@users.noreply.github.com>
Date: Wed, 30 Mar 2022 16:26:02 +0800
Subject: [PATCH] [MoE] Moe apis (#41092)

* add random routing op
  add _random_routing api in utils
  add random routing ut

* # This is a combination of 10 commits.
  # The first commit's message is:
  add expert count op
  add ut for expert_count
  # This is the 2nd commit message:
  update UT only for cuda
  # This is the 3rd commit message:
  fix for rocm
  # This is the 4th commit message:
  update ut
  # This is the 5th commit message:
  add moe module
  # This is the 6th commit message:
  add expert count op
  add ut for expert_count
  # This is the 7th commit message:
  update UT only for cuda
  # This is the 8th commit message:
  update ut
  # This is the 9th commit message:
  add moe module
  # This is the 10th commit message:
  make expert count private

* add assign pos op
* fix upper num name
* add api _assign pos
* add ut for assign pos op
* update date
* add op about moe gate
  update utils
  add limit by capacity op
  add ut for limit_by_capacity
  add ut for prune_gate_by_capacity
  add ut for limit_by_capacity
  add ut for prune_gate_by_capacity
* fix for win
* fix bugs in test_limit_by_capacity_op
* update ut
* update for test (timeout)
* fix ut
* update
* update(fix) ut for win
* moe apis in incubate

* # This is a combination of 10 commits.
  # The first commit's message is:
  add expert count op
  add ut for expert_count
  # This is the 2nd commit message:
  update UT only for cuda
  # This is the 3rd commit message:
  fix for rocm
  # This is the 4th commit message:
  update ut
  # This is the 5th commit message:
  add moe module
  # This is the 6th commit message:
  add expert count op
  add ut for expert_count
  # This is the 7th commit message:
  update UT only for cuda
  # This is the 8th commit message:
  update ut
  # This is the 9th commit message:
  add moe module
  # This is the 10th commit message:
  make expert count private

* add assign pos op
* fix upper num name
* add api _assign pos
* add ut for assign pos op
* update date
* fix for win
* update for test (timeout)
* fix ut
* update
* fix ut for number count
* add apis and utils
* add gate apis
* add moe and grad clip apis
* update moe apis
* add ops for moe gate
* fix
* update for base moe layer api
* add random routing op
  add _random_routing api in utils
  add random routing ut
* fix for dygraph
* update with ranodm routing
* update
* fix ut for limit by capacity
* update
* update limit by capacity for easily to switch to single thread mode
* update api docs

Co-authored-by: hlygit66666 <2570058140@qq.com>
---
 .../fluid/operators/limit_by_capacity_op.cu   | 13 +++++------
 .../distributed/models/moe/gate/__init__.py   |  2 +-
 .../distributed/models/moe/gate/base_gate.py  |  2 +-
 .../models/moe/gate/gshard_gate.py            |  2 +-
 .../distributed/models/moe/gate/naive_gate.py |  2 +-
 .../models/moe/gate/switch_gate.py            |  2 +-
 .../distributed/models/moe/moe_layer.py       | 22 +++++++++----------
 7 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu
index ebc6d1a927..253ae8162c 100644
--- a/paddle/fluid/operators/limit_by_capacity_op.cu
+++ b/paddle/fluid/operators/limit_by_capacity_op.cu
@@ -20,19 +20,17 @@
 namespace paddle {
 namespace operators {
 
-#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1)
-
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;
 
 template <typename T>
 __global__ void limit_by_capacity_impl(const T* expc, T* cap, T* out,
                                        const int n_expert, const int n_worker) {
-  int eid = blockIdx.y;
-  int wid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (wid < n_worker) {
+  int eid, wid;
+  CUDA_KERNEL_LOOP(i, (n_expert * n_worker)) {
+    wid = i / n_expert;
+    eid = i % n_expert;
     auto proposal = expc[wid * n_expert + eid];
-    // int cap_left = atomicSub(cap + eid, proposal);
     auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
     if (cap_left >= proposal) {
       out[wid * n_expert + eid] = proposal;
@@ -54,12 +52,11 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T> {
     auto out = context.Output("Out");
     auto n_expert = expert_count->numel() / n_worker;
-    // std::cout << "n_expert" << n_expert << std::endl;
 
     const auto place = context.GetPlace();
     const auto& dev_ctx = context.template device_context();
 
-    dim3 grid_dim(CEIL(n_worker, 1024), n_expert);
+    dim3 grid_dim(256);
     dim3 block_dim(1024);
     auto out_data = out->mutable_data<T>(place);
     const T* ec_data = expert_count->data<T>();
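The kernel rework above replaces the two-dimensional (worker, expert) grid with a grid-stride CUDA_KERNEL_LOOP over all n_expert * n_worker cells, which is why a fixed launch of 256 blocks of 1024 threads now covers any problem size and why the change log mentions switching easily to a single-thread mode. Below is a sequential NumPy sketch of the semantics the loop implements; the helper name limit_by_capacity_ref is invented for illustration, and the handling of a proposal that only partially fits (granting the non-negative remainder) is an assumption, since that branch falls outside the hunk shown above.

import numpy as np


def limit_by_capacity_ref(expert_count, capacity, n_worker):
    """Hypothetical sequential reference for the limit_by_capacity kernel.

    expert_count: flat array of shape (n_worker * n_expert,), the number of
                  tokens each worker proposes to send to each expert.
    capacity:     array of shape (n_expert,), remaining capacity per expert.
    Returns the granted counts, same shape as expert_count.
    """
    n_expert = expert_count.size // n_worker
    proposals = expert_count.reshape(n_worker, n_expert)
    out = np.zeros_like(proposals)
    cap = capacity.copy()
    for wid in range(n_worker):      # the CUDA kernel visits these cells in
        for eid in range(n_expert):  # parallel; atomics serialize the updates
            proposal = proposals[wid, eid]
            cap_left = cap[eid]      # value returned by the atomic add in the kernel
            cap[eid] -= proposal
            if cap_left >= proposal:
                out[wid, eid] = proposal
            else:
                # assumed behaviour: grant whatever non-negative capacity remains
                out[wid, eid] = max(cap_left, 0)
    return out.reshape(-1)


# 2 workers, 2 experts; expert 1 has capacity for only 3 more tokens,
# so the second worker's proposal of 2 is cut down to 1.
print(limit_by_capacity_ref(np.array([2, 2, 1, 2]), np.array([8, 3]), n_worker=2))
# [2 2 1 1]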
diff --git a/python/paddle/incubate/distributed/models/moe/gate/__init__.py b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
index 4dd9205f7c..d4bf666eb6 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/__init__.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
index 046051f6b6..100d201d4b 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
index ea44126379..b1c0cd4214 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
index ac6164ceac..785d2e971b 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
index 94347ea15e..54bf3ab148 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py
index 6ee2a30589..99cc38d04b 100644
--- a/python/paddle/incubate/distributed/models/moe/moe_layer.py
+++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py
@@ -30,8 +30,6 @@ from .utils import count_by_gate
 from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
 from paddle import fluid
 
-__all__ = ["MoeLayer"]
-
 
 def _local_scatter(inp, pos):
     if pos.shape != [0]:
@@ -71,7 +69,7 @@ def _all_gather(tensor, group=None, use_calc_stream=True):
                      'ring_id', ring_id, 'nranks', nranks)
 
 
-class MOEScatter(PyLayer):
+class MoEScatter(PyLayer):
     r"""
     Scatter input samples from [batch x sequences] to contiguous alone experts.
     If `world_size` is greater than 1, the samples will first be locally
@@ -117,10 +115,10 @@ class MOEScatter(PyLayer):
         return grad_in, None, None, None
 
 
-class MOEGather(PyLayer):
+class MoEGather(PyLayer):
     r"""
     Gather output samples from contiguous alone experts back to [batch x
-    sequences]. Works symmetrically with MOEScatter.
+    sequences]. Works symmetrically with MoEScatter.
     """
 
     @staticmethod
@@ -225,8 +223,8 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
         fwd_batch_size,
     )
 
 
-class MoeLayer(nn.Layer):
-    """Moe Layer
+class MoELayer(nn.Layer):
+    """MoE Layer
     Args:
         d_model: (int) model dimention
         experts: (nn.LayerList) expert networks list
@@ -243,7 +241,7 @@ class MoeLayer(nn.Layer):
     Examples:
         .. code-block:: python
 
        from paddle.nn import layer, LayerList
-       from paddle.distributed.moe import Moelayer
+       from paddle.distributed.moe import MoElayer
        from paddle.distributed.collective import Group
        from paddle.distributed import fleet
@@ -279,7 +277,7 @@ class MoeLayer(nn.Layer):
            exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
            experts_list.append(exp_layer)
 
-       moeLayer = MoeLayer(d_model = d_model,
+       moeLayer = MoELayer(d_model = d_model,
                            experts=experts_list,
                            gate=gate_config,
                            moe_group=moe_group,
@@ -295,7 +293,7 @@ class MoeLayer(nn.Layer):
                  moe_group=None,
                  mp_group=None,
                  **kwargs):
-        super(MoeLayer, self).__init__()
+        super(MoELayer, self).__init__()
 
         recompute_interval = kwargs.get("recompute_interval", 0)
 
@@ -385,7 +383,7 @@ class MoeLayer(nn.Layer):
            temp_pos = pos
         assert topk == self.top_k
 
-        x = MOEScatter.apply(inp, temp_pos, local_expert_count,
+        x = MoEScatter.apply(inp, temp_pos, local_expert_count,
                              global_expert_count, fwd_batch_size,
                              self.world_size, self.group)
 
@@ -416,7 +414,7 @@ class MoeLayer(nn.Layer):
         if len(gate.shape) == 2:
             out_batch_size *= gate.shape[1]
 
-        x = MOEGather.apply(x, pos, local_expert_count, global_expert_count,
+        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
                             out_batch_size, self.world_size, self.group)
 
         x = x.reshape([-1, self.top_k, d_model])
-- 
GitLab
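For readers of this patch, a condensed usage sketch of the renamed MoELayer follows. It is illustrative only and rests on several assumptions that the diff does not confirm: the import path paddle.incubate.distributed.models.moe (the package this patch edits, rather than the paddle.distributed.moe path shown in the docstring snippet), the gate config dict, and the ExpertLayer placeholder; it also assumes the script is started with paddle.distributed.launch so that fleet can form the MoE group.

import paddle
import paddle.nn as nn
from paddle.distributed import fleet
# assumed import path; the class lives in
# python/paddle/incubate/distributed/models/moe/moe_layer.py in this patch
from paddle.incubate.distributed.models.moe import MoELayer

d_model, dim_feedforward, top_k, num_experts = 8, 32, 2, 8


class ExpertLayer(nn.Layer):
    # placeholder expert; the docstring's ExpertLayer is not shown in this diff
    def __init__(self, d_model, d_hidden):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_model)

    def forward(self, x):
        return self.fc2(paddle.nn.functional.relu(self.fc1(x)))


fleet.init(is_collective=True)
moe_group = paddle.distributed.new_group(list(range(fleet.worker_num())))

# per the docstring, experts is an nn.LayerList of expert networks
experts_list = nn.LayerList(
    [ExpertLayer(d_model, dim_feedforward // top_k) for _ in range(num_experts)])

moe_layer = MoELayer(
    d_model=d_model,
    experts=experts_list,
    gate={"type": "gshard", "top_k": top_k},  # assumed gate config dict
    moe_group=moe_group,
    recompute_interval=0)

out = moe_layer(paddle.rand([4, 16, d_model]))  # assumed [batch, seq, d_model] input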