// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The file has been adapted from the two files:
//     https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cu
//     https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cuh
//     Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4
// We retain the following license from the original files:
//      Copyright 2021, Jiaao He. All rights reserved.
//  Licensed under the Apache License, Version 2.0 (the "License").

#include "paddle/fluid/operators/limit_by_capacity_op.h"

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {

// Clamps each (worker, expert) token count by the expert's remaining capacity.
//
// expc:  [n_worker, n_expert] requested token counts per worker per expert.
// cap:   [n_expert] remaining capacity per expert; consumed (decremented)
//        in place by this kernel, so callers must pass a scratch copy.
// out:   [n_worker, n_expert] granted token counts.
//
// Each index atomically reserves its full request from the expert's
// capacity; the value returned by the atomic (capacity before this
// subtraction) decides how much of the request is actually granted.
template <typename T>
__global__ void limit_by_capacity_impl(
    const T* expc, T* cap, T* out, const int n_expert, const int n_worker) {
  CUDA_KERNEL_LOOP(idx, (n_expert * n_worker)) {
    // idx == wid * n_expert + eid, so the flat index addresses expc/out
    // directly; only the expert id is needed for the capacity slot.
    const int eid = idx % n_expert;
    const T want = expc[idx];
    // Capacity available before this thread's reservation.
    const T avail = phi::CudaAtomicAdd(cap + eid, -want);
    T granted;
    if (avail >= want) {
      granted = want;          // fully satisfied
    } else if (avail >= 0) {
      granted = avail;         // partially satisfied with what was left
    } else {
      granted = static_cast<T>(0);  // capacity already exhausted
    }
    out[idx] = granted;
  }
}

template <typename T, typename DeviceContext>
class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T> {
 public:
  // Caps the per-(worker, expert) token counts in "expert_count" by the
  // per-expert budget in "capacity", writing the granted counts to "Out".
  void Compute(const framework::ExecutionContext& context) const override {
    auto expert_count = context.Input<phi::DenseTensor>("expert_count");
    auto capacity = context.Input<phi::DenseTensor>("capacity");
    auto n_worker = context.Attr<int>("n_worker");
    auto out = context.Output<phi::DenseTensor>("Out");

    // expert_count is laid out as [n_worker, n_expert] flattened.
    auto n_expert = expert_count->numel() / n_worker;
    const auto place = context.GetPlace();
    const auto& dev_ctx = context.template device_context<phi::GPUContext>();

    // The kernel consumes capacity in place, so run it on a scratch copy
    // rather than mutating the op's input tensor.
    phi::DenseTensor capacity_copy;
    framework::TensorCopy(*capacity, place, dev_ctx, &capacity_copy);

    const T* count_ptr = expert_count->data<T>();
    T* capacity_ptr = capacity_copy.mutable_data<T>(place);
    T* out_ptr = out->mutable_data<T>(place);

    // Fixed launch shape; the kernel's grid-stride loop covers any size.
    dim3 grid_dim(256);
    dim3 block_dim(1024);
    limit_by_capacity_impl<T><<<grid_dim, block_dim, 0, dev_ctx.stream()>>>(
        count_ptr, capacity_ptr, out_ptr, n_expert, n_worker);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// Register the GPU kernel for the limit_by_capacity op; only int64_t
// expert-count tensors are supported.
PD_REGISTER_STRUCT_KERNEL(limit_by_capacity,
                          GPU,
                          ALL_LAYOUT,
                          ops::LimitByCapacityOpCUDAKernel,
                          int64_t) {}