diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py
index 712c0a7cde3845d81890e068fb426aa892b987c7..290dc96b6342d959e1666f2ff90dda2f88c11844 100644
--- a/python/paddle/fluid/layers/collective.py
+++ b/python/paddle/fluid/layers/collective.py
@@ -298,12 +298,12 @@ class DistributedClassifier(object):
 
         return avg_loss
 
-    def arcmargin_classify(self,
-                           x,
-                           label,
-                           margin=0.5,
-                           logit_scale=64,
-                           param_attr=None):
+    def arcface_classify(self,
+                         x,
+                         label,
+                         margin=0.5,
+                         logit_scale=64,
+                         param_attr=None):
         '''
         reference: ArcFace. https://arxiv.org/abs/1801.07698
         '''
@@ -362,72 +362,126 @@ class DistributedClassifier(object):
 
         return avg_loss
 
-def distributed_fc_classify(x,
-                            label,
-                            class_num,
-                            nranks,
-                            rank_id,
-                            param_attr=None,
-                            use_bias=True,
-                            name='dist_fc'):
+def _distributed_fc_classify(x,
+                             label,
+                             class_num,
+                             nranks,
+                             rank_id,
+                             param_attr=None,
+                             use_bias=True,
+                             name=None):
     '''
+    Distributed version of a classification layer with FC, softmax and cross
+    entropy calculation, for cases where the number of classes is too large.
+
+    Args:
+        x (Variable): The feature representation of the input samples. This
+            feature will be flattened into 2-D tensor from dimension index
+            1. E.g. [32, 1024, 1, 1] will be flattened to [32, 1024].
+        label (Variable): The label corresponding to the input samples.
+        class_num (integer): The number of classes of the classification problem.
+        nranks (integer): The number of ranks of distributed trainers.
+        rank_id (integer): The rank index of the current trainer.
+        param_attr (ParamAttr, default None): The parameter attribute for
+            learnable distributed parameters/weights of this layer.
+        use_bias (bool, default True): Whether to add a bias term to the
+            fully-connected output.
+        name (str, default None): The name of this layer.
+    Returns:
+        Variable: The softmax cross entropy loss.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            input = fluid.layers.data(name="input",
+                                      shape=[32, 1024],
+                                      dtype='float32',
+                                      append_batch_size=False)
+            label = fluid.layers.data(name="label",
+                                      shape=[32, 1],
+                                      dtype='int64',
+                                      append_batch_size=False)
+            y = fluid.layers.collective._distributed_fc_classify(x=input,
+                                                                 label=label,
+                                                                 class_num=1000,
+                                                                 nranks=8,
+                                                                 rank_id=0)
     '''
+
+    if name is None:
+        name = 'dist_fc'
     helper = LayerHelper(name, **locals())
     classifier = DistributedClassifier(class_num, nranks, rank_id, helper)
     return classifier.fc_classify(x, label, param_attr, use_bias)
 
 
-def distributed_arcmargin_classify(x,
-                                   label,
-                                   class_num,
-                                   nranks,
-                                   rank_id,
-                                   margin=0.5,
-                                   logit_scale=64,
-                                   param_attr=None,
-                                   name='dist_fc'):
+def _distributed_arcface_classify(x,
+                                  label,
+                                  class_num,
+                                  nranks,
+                                  rank_id,
+                                  margin=0.5,
+                                  logit_scale=64.0,
+                                  param_attr=None,
+                                  name=None):
     '''
+    Distributed version of a classification layer with ArcFace loss, for
+    cases where the number of classes is too large. The equation is
+
+    .. math::
+
+        L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(\cos(\theta_{y_i}+m))}}{e^{s(\cos(\theta_{y_i}+m))}+\sum^n_{j=1,j\neq y_i} e^{s\cos\theta_j}}
+
+    where :math:`\theta_{y_i}` is the angle between the feature :math:`x` and
+    the weight representing class :math:`y_i`. The details of the ArcFace
+    loss are described in https://arxiv.org/abs/1801.07698.
+
+    Args:
+        x (Variable): The feature representation of the input samples. This
+            feature will be flattened into 2-D tensor from dimension index
+            1. E.g. [32, 1024, 1, 1] will be flattened to [32, 1024].
+        label (Variable): The label corresponding to the input samples.
+        class_num (integer): The number of classes of the classification problem.
+        nranks (integer): The number of ranks of distributed trainers.
+        rank_id (integer): The rank index of the current trainer.
+        margin (float, default 0.5): The angular margin penalty to enhance
+            the intra-class compactness and inter-class discrepancy.
+        logit_scale (float, default 64.0): The scale factor for logit value
+            of cosine range.
+        param_attr (ParamAttr, default None): The parameter attribute for
+            learnable distributed parameters/weights of this layer.
+        name (str, default None): The name of this layer.
+    Returns:
+        Variable: The ArcFace loss.
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            input = fluid.layers.data(name="input",
+                                      shape=[32, 1024],
+                                      dtype='float32',
+                                      append_batch_size=False)
+            label = fluid.layers.data(name="label",
+                                      shape=[32, 1],
+                                      dtype='int64',
+                                      append_batch_size=False)
+            y = fluid.layers.collective._distributed_arcface_classify(x=input,
+                                                                      label=label,
+                                                                      class_num=1000,
+                                                                      nranks=8,
+                                                                      rank_id=0)
     '''
+    if name is None:
+        name = 'dist_fc'
     helper = LayerHelper(name, **locals())
     classifier = DistributedClassifier(class_num, nranks, rank_id, helper)
-    return classifier.arcmargin_classify(
+    return classifier.arcface_classify(
         x=x,
         label=label,
         margin=margin,
         logit_scale=logit_scale,
         param_attr=param_attr)
-
-
-def distributed_fc(x,
-                   out_dim,
-                   nranks,
-                   rank_id,
-                   param_attr=None,
-                   use_bias=True,
-                   name='dist_fc'):
-    '''
-    '''
-    helper = LayerHelper(name, **locals())
-    classifier = DistributedClassifier(out_dim, nranks, rank_id, helper)
-    weight, bias = classifier.create_parameter(
-        dtype=x.dtype,
-        in_dim=x.shape[-1],
-        param_attr=param_attr,
-        use_bias=use_bias)
-    x_all = _c_allgather(x, nranks=self.nranks, use_calc_stream=True)
-    label_all = _c_allgather(label, nranks=self.nranks, use_calc_stream=True)
-
-    shard_fc = nn.mul(x_all, weight)
-    if use_bias:
-        shard_fc = nn.elementwise_add(shard_fc, bias)
-
-    # sample code
-    #if not classifier.is_equal_division:
-    #    shard_fc = nn.pad(shard_fc)
-    #fc = _c_slice_allgather(shard_fc,
-    #                        nranks=nranks,
-    #                        rank_id=rank_id)
-    #if not classifier.is_equal_division:
-    #    fc = nn.depad(fc)
-    #return fc
-    raise NotImplementedError('distributed_fc')
diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py
index e8d9dad8c4008f09a6edc44a970781b47db99ba6..af82cd451acc7ccb19e0c4fd8a7e14bfaa1d9e38 100644
--- a/python/paddle/fluid/transpiler/collective.py
+++ b/python/paddle/fluid/transpiler/collective.py
@@ -376,6 +376,8 @@ class LocalSGD(Collective):
 
 class DistributedClassificationOptimizer(object):
     '''
+    An optimizer wrapper that generates the backward network for distributed
+    classification training with model parallelism.
     '''
 
     def __init__(self, optimizer, batch_size):
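As a sanity check on the docstring equation above, here is a minimal single-process NumPy sketch of the ArcFace loss that `arcface_classify` computes. It is a reference illustration only, not the patch's implementation: the sharded classifier weights, the all-gather collectives and the model-parallel softmax performed by `DistributedClassifier` are omitted, and every name in the snippet (`arcface_loss`, `features`, `weights`, ...) is hypothetical.

.. code-block:: python

    # Minimal single-process sketch of the ArcFace loss (hypothetical names;
    # the distributed sharding and collective ops of the patch are omitted).
    import numpy as np

    def arcface_loss(features, weights, labels, margin=0.5, logit_scale=64.0):
        # Normalize features and class weights so the FC output equals
        # cos(theta) between each sample and each class center.
        x = features / np.linalg.norm(features, axis=1, keepdims=True)
        w = weights / np.linalg.norm(weights, axis=0, keepdims=True)
        cosine = x.dot(w)  # shape: [batch, class_num]

        # Add the angular margin m to the target-class angle only.
        rows = np.arange(len(labels))
        theta_y = np.arccos(np.clip(cosine[rows, labels], -1.0, 1.0))
        cosine[rows, labels] = np.cos(theta_y + margin)

        # Scaled softmax cross entropy, matching the docstring equation.
        logits = logit_scale * cosine
        logits -= logits.max(axis=1, keepdims=True)  # numerical stability
        log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return -log_prob[rows, labels].mean()

    rng = np.random.default_rng(0)
    feat = rng.standard_normal((32, 1024))
    w = rng.standard_normal((1024, 1000))
    y = rng.integers(0, 1000, size=32)
    print(arcface_loss(feat, w, y))

The margin is applied before scaling by ``logit_scale``, which is why the denominator keeps the plain :math:`e^{s\cos\theta_j}` terms for all non-target classes while only the target class gets :math:`e^{s(\cos(\theta_{y_i}+m))}`.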