diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu
index ebc6d1a927c57d61e12a3e3aa0f9b699bbbc5920..253ae8162c9b4e668b5651e7b73b58ade9136b05 100644
--- a/paddle/fluid/operators/limit_by_capacity_op.cu
+++ b/paddle/fluid/operators/limit_by_capacity_op.cu
@@ -20,19 +20,17 @@
 namespace paddle {
 namespace operators {

-#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1)
-
 using LoDTensor = framework::LoDTensor;
 using Tensor = framework::Tensor;

 template <typename T>
 __global__ void limit_by_capacity_impl(const T* expc, T* cap, T* out,
                                        const int n_expert, const int n_worker) {
-  int eid = blockIdx.y;
-  int wid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (wid < n_worker) {
+  int eid, wid;
+  CUDA_KERNEL_LOOP(i, (n_expert * n_worker)) {
+    wid = i / n_expert;
+    eid = i % n_expert;
     auto proposal = expc[wid * n_expert + eid];
-    // int cap_left = atomicSub(cap + eid, proposal);
     auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1));
     if (cap_left >= proposal) {
       out[wid * n_expert + eid] = proposal;
@@ -54,12 +52,11 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T> {
     auto out = context.Output<Tensor>("Out");

     auto n_expert = expert_count->numel() / n_worker;
-    // std::cout << "n_expert" << n_expert << std::endl;
     const auto place = context.GetPlace();
     const auto& dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();

-    dim3 grid_dim(CEIL(n_worker, 1024), n_expert);
+    dim3 grid_dim(256);
     dim3 block_dim(1024);
     auto out_data = out->mutable_data<T>(place);
     const T* ec_data = expert_count->data<T>();
diff --git a/python/paddle/incubate/distributed/models/moe/gate/__init__.py b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
index 4dd9205f7c144da51fb66102b7d75da13f11f659..d4bf666eb698eb0052476320a381880c52c08687 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/__init__.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
index 046051f6b6adbd8358c09bea9f7eb72eac88544d..100d201d4b3d1bdd9f100f8817f30ea8c707eda0 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
index ea441263790aba02bf495e1e7e432cc022d7db1d..b1c0cd4214dbb1b66cea91224fb5e3eaa094b991 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
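[Reviewer note] The rewritten `limit_by_capacity_impl` above replaces the `blockIdx.y`-per-expert launch with a grid-stride `CUDA_KERNEL_LOOP` over a flat `n_expert * n_worker` index, so the fixed `grid_dim(256)` launch no longer depends on `n_worker` and the `CEIL` macro becomes unnecessary. As a minimal sketch of the per-element semantics, here is a sequential Python reference; the helper name `limit_by_capacity_ref` is hypothetical, and the overflow branch (not visible in this hunk) is an assumption.

```python
import numpy as np


def limit_by_capacity_ref(expert_count, capacity, n_worker):
    """Sequential reference for limit_by_capacity_impl (hypothetical helper).

    expert_count: int array, flat length n_worker * n_expert, tokens each
                  worker proposes to send to each expert.
    capacity:     int array of length n_expert, available slots per expert.
    Returns a flat array with the number of tokens each (worker, expert)
    pair actually keeps.
    """
    n_expert = expert_count.size // n_worker
    expc = np.asarray(expert_count).reshape(n_worker, n_expert)
    cap = np.asarray(capacity).copy()
    out = np.zeros_like(expc)
    # The CUDA kernel visits each flat index i in [0, n_expert * n_worker)
    # with wid = i / n_expert and eid = i % n_expert; the atomic add of
    # -proposal serializes the per-expert capacity update, which this
    # sequential loop mimics.
    for wid in range(n_worker):
        for eid in range(n_expert):
            proposal = expc[wid, eid]
            cap_left = cap[eid]            # value before the subtraction
            cap[eid] -= proposal
            if cap_left >= proposal:
                out[wid, eid] = proposal   # the whole proposal fits
            else:
                # Assumed overflow behaviour (branch not shown in the hunk):
                # keep whatever non-negative capacity remained.
                out[wid, eid] = max(cap_left, 0)
    return out.reshape(-1)
```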
diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
index ac6164ceace8b153326608df8dfb2525d110fb5a..785d2e971bb3681a50746920d2e04aefdde97242 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
index 94347ea15eb0bee4f897b66f94a155f2918ed2ed..54bf3ab148ab2a7b6ed666de1a67a944f6e108f8 100644
--- a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
+++ b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py
index 6ee2a30589cddfdd8c6e67891c6eed671f0a2cb8..99cc38d04bdda79201c73a6a1fecf453602a2dca 100644
--- a/python/paddle/incubate/distributed/models/moe/moe_layer.py
+++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py
@@ -30,8 +30,6 @@ from .utils import count_by_gate
 from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
 from paddle import fluid

-__all__ = ["MoeLayer"]
-

 def _local_scatter(inp, pos):
     if pos.shape != [0]:
@@ -71,7 +69,7 @@ def _all_gather(tensor, group=None, use_calc_stream=True):
                                 'ring_id', ring_id, 'nranks', nranks)


-class MOEScatter(PyLayer):
+class MoEScatter(PyLayer):
     r"""
     Scatter input samples from [batch x sequences] to contiguous alone experts.
     If `world_size` is greater than 1, the samples will first be locally
@@ -117,10 +115,10 @@ class MOEScatter(PyLayer):
         return grad_in, None, None, None


-class MOEGather(PyLayer):
+class MoEGather(PyLayer):
     r"""
     Gather output samples from contiguous alone experts back to [batch x
-    sequences]. Works symmetrically with MOEScatter.
+    sequences]. Works symmetrically with MoEScatter.
     """

     @staticmethod
@@ -225,8 +223,8 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
         fwd_batch_size, )


-class MoeLayer(nn.Layer):
-    """Moe Layer
+class MoELayer(nn.Layer):
+    """MoE Layer
     Args:
         d_model: (int) model dimention
         experts: (nn.LayerList) expert networks list
@@ -243,7 +241,7 @@ class MoeLayer(nn.Layer):
     Examples:
         .. code-block:: python

         from paddle.nn import layer, LayerList
-        from paddle.distributed.moe import Moelayer
+        from paddle.distributed.moe import MoELayer
         from paddle.distributed.collective import Group
         from paddle.distributed import fleet
@@ -279,7 +277,7 @@ class MoeLayer(nn.Layer):
             exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
             experts_list.append(exp_layer)

-        moeLayer = MoeLayer(d_model = d_model,
+        moeLayer = MoELayer(d_model = d_model,
                             experts=experts_list,
                             gate=gate_config,
                             moe_group=moe_group,
@@ -295,7 +293,7 @@ class MoeLayer(nn.Layer):
                  moe_group=None,
                  mp_group=None,
                  **kwargs):
-        super(MoeLayer, self).__init__()
+        super(MoELayer, self).__init__()

         recompute_interval = kwargs.get("recompute_interval", 0)

@@ -385,7 +383,7 @@ class MoeLayer(nn.Layer):
             temp_pos = pos
         assert topk == self.top_k

-        x = MOEScatter.apply(inp, temp_pos, local_expert_count,
+        x = MoEScatter.apply(inp, temp_pos, local_expert_count,
                              global_expert_count, fwd_batch_size,
                              self.world_size, self.group)

@@ -416,7 +414,7 @@ class MoeLayer(nn.Layer):
         if len(gate.shape) == 2:
             out_batch_size *= gate.shape[1]

-        x = MOEGather.apply(x, pos, local_expert_count, global_expert_count,
+        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
                             out_batch_size, self.world_size, self.group)

         x = x.reshape([-1, self.top_k, d_model])
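[Reviewer note] The renames are casing-only (`MOEScatter` -> `MoEScatter`, `MOEGather` -> `MoEGather`, `MoeLayer` -> `MoELayer`); the call signatures in the hunks above are unchanged, so callers only need the name update (and `MoeLayer` is no longer re-exported via `__all__`). Below is a condensed sketch of how the two renamed PyLayers pair up inside `MoELayer.forward`, based only on the calls visible in this diff; the helper name `_moe_forward_sketch` and the elided expert step are illustrative, not the full implementation.

```python
# Module path taken from the diff above.
from paddle.incubate.distributed.models.moe.moe_layer import MoEScatter, MoEGather


def _moe_forward_sketch(self, inp, pos, temp_pos, local_expert_count,
                        global_expert_count, fwd_batch_size, out_batch_size,
                        d_model):
    # Hypothetical condensation of MoELayer.forward, limited to the calls
    # shown in the hunks above.

    # Scatter tokens to the experts they were routed to (all-to-all across
    # ranks when world_size > 1), via the renamed MoEScatter PyLayer.
    x = MoEScatter.apply(inp, temp_pos, local_expert_count,
                         global_expert_count, fwd_batch_size,
                         self.world_size, self.group)

    # ... the expert networks run on their contiguous slices of x here ...

    # Gather expert outputs back to the original [batch x sequences] layout,
    # symmetrically to the scatter, via the renamed MoEGather PyLayer.
    x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
                        out_batch_size, self.world_size, self.group)
    return x.reshape([-1, self.top_k, d_model])
```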