switch_gate.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The file has been adapted from the file:
#     https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py
#     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4
# We retain the following license from the original files:
#     Copyright 2021, Jiaao He. All rights reserved.
#   Licensed under the Apache License, Version 2.0 (the "License").

import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from .naive_gate import NaiveGate
from ..utils import limit_by_capacity


class SwitchGate(NaiveGate):
    def __init__(self,
                 d_model,
                 num_expert,
                 world_size,
                 topk=1,
                 switch_eps=.1,
                 capacity=(1.2, 2.4),
                 group=None):
        assert topk == 1, "topk should be 1 in switch"
        super().__init__(d_model, num_expert, world_size, topk=1)
        self.switch_eps = switch_eps
        self.capacity = capacity
        self.group = group

    def forward(self, inp):
        score = self.gate(inp)

        if self.training:
            noise = paddle.rand(shape=score.shape)
            noise = noise * 2 * self.switch_eps + 1.0 - self.switch_eps
            score += noise

        score = F.softmax(score, axis=-1)
        top1_score, top1_idx = paddle.topk(score, k=1, axis=-1, largest=True)

        cap_rate = self.capacity[0 if self.training else 1]
        capacity = math.ceil(cap_rate * inp.shape[0])
        _new_lec, _new_gec, top1_idx = limit_by_capacity(
            top1_idx,
            self.num_expert,
            self.world_size,
            capacity,
            group=self.group)
        valid_idx = top1_idx[top1_idx > -1]
        valid_idx_tmp = paddle.reshape(valid_idx, shape=[len(valid_idx), 1])
        fraction_expert = paddle.scatter_nd_add(
            x=paddle.zeros(shape=[self.tot_expert]),
            index=valid_idx_tmp,
            updates=paddle.ones_like(
                valid_idx, dtype=paddle.float32).reshape(
                    shape=[len(valid_idx)]), ) / valid_idx.numel()
        prob_expert = score.sum(axis=0) / valid_idx.numel()
        loss = (fraction_expert * prob_expert).sum() * self.tot_expert
        self.set_loss(loss)

        return top1_score, top1_idx