From f92dbfb7ab1216574bb6512b059da36f9ba220ea Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Mon, 18 Apr 2022 18:08:53 +0800 Subject: [PATCH] fix bugs in moe (#41903) * fix moe apis (#41650) * Moe ref (#41836) * moe ref * ref commit * update; document_fix * update;document_fix * Moe ref (#41864) * moe ref * ref commit; document_fix * update; document_fix * update document_fix * update; document_fix --- paddle/fluid/operators/assign_pos_op.cu | 11 ++++++++++- paddle/fluid/operators/limit_by_capacity_op.cu | 8 ++++++++ paddle/fluid/operators/number_count_op.cu | 8 ++++++++ paddle/fluid/operators/prune_gate_by_capacity_op.cu | 8 ++++++++ .../distributed/models/moe/gate/base_gate.py | 7 +++++++ .../distributed/models/moe/gate/gshard_gate.py | 9 ++++++++- .../distributed/models/moe/gate/naive_gate.py | 9 ++++++++- .../distributed/models/moe/gate/switch_gate.py | 9 ++++++++- .../incubate/distributed/models/moe/grad_clip.py | 5 +++++ .../incubate/distributed/models/moe/moe_layer.py | 7 +++++++ .../paddle/incubate/distributed/models/moe/utils.py | 13 +++++++++++-- 11 files changed, 88 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 5fa159b94f..d96d36931b 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -10,7 +10,16 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. */ +limitations under the License. + +The file has been adapted from the two files: + https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cu + https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cuh + Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +We retain the following license from the original files: + Copyright 2021, Jiaao He + Licensed under the Apache License, Version 2.0 (the "License"). +*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/assign_pos_op.h" diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 253ae8162c..c77adf2200 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/limit_by_capacity_op.h" diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 0106c70d8e..923d89c248 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/local_exchange.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/number_count_op.h" diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 953847512b..7228bdbf38 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -11,6 +11,14 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// The file has been adapted from the two files: +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cu +// https://github.com/laekov/fastmoe/blob/master/cuda/balancing.cuh +// Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +// We retain the following license from the original files: +// Copyright 2021, Jiaao He. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"). #include "paddle/fluid/operators/prune_gate_by_capacity_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py index 100d201d4b..f527e82f04 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/base_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/base_gate.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/base_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import paddle.nn as nn diff --git a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py index b1c0cd4214..3618ec56e9 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/gshard_gate.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/gshard_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import math import paddle @@ -62,6 +69,6 @@ class GShardGate(NaiveGate): if self.random_routing: rand_routing_prob = paddle.rand( shape=[gate_score.shape[0]], dtype="float32") - topk_idx = paddle.distributed.utils.random_routing( + topk_idx = paddle.distributed.models.moe.utils._random_routing( topk_idx, topk_val, rand_routing_prob) return topk_val, topk_idx diff --git a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py index 785d2e971b..c3c6868544 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/naive_gate.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/naive_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). from .base_gate import BaseGate diff --git a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py index 54bf3ab148..776516989e 100644 --- a/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py +++ b/python/paddle/incubate/distributed/models/moe/gate/switch_gate.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/gates/switch_gate.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import math import paddle diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index cde5455d27..b620253b9f 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -55,6 +55,11 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Reference: + https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch + Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 + + Args: clip_norm (float): The maximum norm value. is_expert_param_func (function): a function to decide whether a param should be put into moe_params_grads diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index 99cc38d04b..eebb635e3e 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -11,6 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/layers.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). import collections import math diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index 99e31a1627..25c76c9753 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddle.distributed.models.moe.utils import * +# +# The file has been adapted from the file: +# https://github.com/laekov/fastmoe/blob/master/fmoe/functions.py +# Git commit hash: 295a615aacce7e54a37e7935274ba15e901c78e4 +# We retain the following license from the original files: +# Copyright 2021, Jiaao He. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"). + +from paddle.distributed.models.moe.utils import _number_count, _limit_by_capacity, _prune_gate_by_capacity, _assign_pos +import paddle def _alltoall(in_tensor_list, group=None, use_calc_stream=True): -- GitLab