sampler.py 9.6 KB
Newer Older
L
lifuchen 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 15 16 17 18 19 20 21 22 23 24
"""
In most cases we have a non-stream dataset, which means we can randomly access it with __getitem__ and get the length of the dataset with __len__.

This suffices for a sampler. We implement a sampler as an iterable of valid indices. By valid, we mean 0 <= index < N, where N is the length of the dataset. We then collect several indices into a batch and use them to fetch examples from the dataset with __getitem__, then collate these examples to form a batch.

So the sampler is only responsible for generating valid indices.
"""

import numpy as np
import random

L
lifuchen 已提交
25

26 27 28 29 30 31 32 33 34 35 36 37 38
class Sampler(object):
    """Abstract base class shared by all index samplers.

    A concrete sampler overrides ``__iter__`` so that iterating over it
    produces dataset indices (or lists of indices for a batch sampler).
    """

    def __init__(self, data_source):
        """Accept ``data_source`` without storing it; subclasses keep what they need."""

    def __iter__(self):
        """Return an iterator of indices, or of ``list[int]`` for a batch sampler.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError


class SequentialSampler(Sampler):
    """Yield the indices ``0, 1, ..., len(data_source) - 1`` in ascending order.

    Args:
        data_source: any object that supports ``len()``.
    """

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        """Return an iterator over all indices in order."""
        # The length is read on every call, so a data source whose size
        # changed since construction is reflected in the next pass.
        return iter(range(len(self.data_source)))

    def __len__(self):
        """Return the number of indices produced by one pass."""
        return len(self.data_source)


class RandomSampler(Sampler):
    """Yield dataset indices in random order.

    Args:
        data_source: any object that supports ``len()``.
        replacement (bool): if ``True``, indices are drawn independently and
            uniformly from ``[0, len(data_source))`` so duplicates may occur;
            if ``False``, a random permutation of all indices is produced.
        num_samples (int, optional): number of indices to draw. May only be
            given together with ``replacement=True``; when omitted it
            defaults to ``len(data_source)``.

    Raises:
        ValueError: if ``replacement`` is not a bool, if ``num_samples`` is
            given while ``replacement`` is ``False``, or if the effective
            number of samples is not a positive integer (which includes an
            empty ``data_source`` when ``num_samples`` is omitted).
    """

    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples

        if not isinstance(self.replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(self.replacement))

        if self._num_samples is not None and not replacement:
            raise ValueError(
                "With replacement=False, num_samples should not be specified, "
                "since a random permutation will be performed.")

        # Goes through the property so the default (dataset length) is
        # validated as well.
        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(
                                 self.num_samples))

    @property
    def num_samples(self):
        """Number of indices produced by one pass over the sampler."""
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def __iter__(self):
        """Return an iterator over freshly drawn random indices."""
        n = len(self.data_source)
        if self.replacement:
            drawn = np.random.randint(
                0, n, size=(self.num_samples, ), dtype=np.int64)
            return iter(drawn.tolist())
        return iter(np.random.permutation(n).tolist())

    def __len__(self):
        """Return ``num_samples``."""
        return self.num_samples
84 85 86 87 88 89 90 91 92 93 94 95


class SubsetRandomSampler(Sampler):
    r"""Samples elements randomly from a given list of indices, without replacement.

    Arguments:
        indices (sequence): a sequence of indices
    """

    def __init__(self, indices):
        self.indices = indices

    def __iter__(self):
        """Return a generator over ``indices`` in a freshly shuffled order."""
        # The permutation is drawn once per call; elements are then looked
        # up lazily as the generator is consumed.
        order = np.random.permutation(len(self.indices))
        return (self.indices[i] for i in order)

    def __len__(self):
        """Return the number of indices in the subset."""
        return len(self.indices)


class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
    """Partially randomized sampler, implemented as an example sampler.

    1. Sort the examples by length.
    2. Split the sorted order into consecutive groups of
       ``batch_group_size`` and shuffle inside each group.
    3. Optionally permute the mini-batches that make up those groups.
    4. Shuffle whatever is left after the last full group.

    Args:
        lengths (sequence of int): length of every example; position ``i``
            describes example ``i``.
        batch_size (int): mini-batch size used when permuting batches.
        batch_group_size (int, optional): number of consecutive sorted
            examples shuffled together; must be a multiple of ``batch_size``.
            Defaults to ``min(batch_size * 32, len(lengths))`` rounded down
            to a multiple of ``batch_size``.
        permutate (bool): whether to permute whole mini-batches after the
            in-group shuffle.
    """

    def __init__(self,
                 lengths,
                 batch_size=4,
                 batch_group_size=None,
                 permutate=True):
        # maybe better implement length as a sort key
        _lengths = np.array(lengths, dtype=np.int64)
        self.lengths = np.sort(_lengths)
        self.sorted_indices = np.argsort(_lengths)

        self.batch_size = batch_size
        if batch_group_size is None:
            batch_group_size = min(batch_size * 32, len(self.lengths))
            # Round down to a multiple of batch_size. This yields 0 when the
            # dataset is smaller than one batch; __iter__ copes with that.
            batch_group_size -= batch_group_size % batch_size

        self.batch_group_size = batch_group_size
        assert batch_group_size % batch_size == 0
        self.permutate = permutate

    def __iter__(self):
        """Return an iterator over a partially shuffled copy of the sorted indices."""
        indices = np.copy(self.sorted_indices)
        group_size = self.batch_group_size

        # A group size of 0 (dataset smaller than one batch) means there are
        # no full groups; guard it so the division below cannot fail.
        n_groups = len(indices) // group_size if group_size > 0 else 0
        grouped_end = n_groups * group_size

        for g in range(n_groups):
            start = g * group_size
            # A basic slice of a 1-D array is a view, so this shuffles
            # `indices` itself.
            random.shuffle(indices[start:start + group_size])

        # Permutate batches
        if self.permutate and grouped_end > 0:
            perm = np.arange(grouped_end // self.batch_size)
            random.shuffle(perm)
            indices[:grouped_end] = indices[:grouped_end].reshape(
                -1, self.batch_size)[perm, :].reshape(-1)

        # Handle last elements: everything after the last full group. When no
        # full group fits, this is the whole dataset.
        if grouped_end < len(indices):
            random.shuffle(indices[grouped_end:])

        return iter(indices)

    def __len__(self):
        """Return the number of examples."""
        return len(self.sorted_indices)


class WeightedRandomSampler(Sampler):
    r"""Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights).

    Args:
        weights (sequence): a sequence of non-negative weights, not
            necessarily summing up to one; they are normalized before
            sampling.
        num_samples (int): number of samples to draw
        replacement (bool): if ``True``, samples are drawn with replacement.
            If not, they are drawn without replacement, which means that an
            index cannot be drawn more than once.

    Raises:
        ValueError: at construction if ``num_samples`` is not a positive
            integer; at iteration if the weights do not sum to a positive
            finite value.

    Example (the output is random and differs between runs):
        >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True))
        [0, 0, 0, 1, 0]
        >>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False))
        [0, 1, 4, 3, 2]
    """

    def __init__(self, weights, num_samples, replacement):
        if not isinstance(num_samples, int) or num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(
                                 num_samples))
        self.weights = np.array(weights, dtype=np.float64)
        self.num_samples = num_samples
        self.replacement = replacement

    def __iter__(self):
        """Return an iterator over ``num_samples`` weighted random indices."""
        total = self.weights.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("weights should sum to a positive finite value, "
                             "but got sum={}".format(total))
        # np.random.choice requires `p` to sum to one, so normalize here
        # instead of forcing callers to pre-normalize their weights.
        probabilities = self.weights / total
        drawn = np.random.choice(
            len(self.weights),
            size=(self.num_samples, ),
            replace=self.replacement,
            p=probabilities)
        return iter(drawn.tolist())

    def __len__(self):
        """Return ``num_samples``."""
        return self.num_samples


K
Kexin Zhao 已提交
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
class DistributedSampler(Sampler):
    """Give each trainer an equally sized, strided subset of the dataset indices.

    The index list is padded by repeating indices from its start until its
    length is a multiple of ``num_trainers``; trainer ``rank`` then takes
    every ``num_trainers``-th index starting at position ``rank``.

    Args:
        dataset_size (int): number of examples in the dataset.
        num_trainers (int): number of trainers sharing the dataset.
        rank (int): position of this trainer, used as the slice offset.
        shuffle (bool): whether to shuffle the indices before splitting.

    NOTE(review): shuffling uses the module-level ``random`` state.
    Presumably every trainer must seed it identically for the per-rank
    subsets to be complementary -- confirm against the callers.
    """

    def __init__(self, dataset_size, num_trainers, rank, shuffle=True):
        self.dataset_size = dataset_size
        self.num_trainers = num_trainers
        self.rank = rank
        # Integer ceiling division: exact for any int size and independent
        # of true-division semantics.
        self.num_samples = int(
            (dataset_size + num_trainers - 1) // num_trainers)
        self.total_size = self.num_samples * num_trainers
        assert self.total_size >= self.dataset_size
        self.shuffle = shuffle

    def __iter__(self):
        """Return an iterator over the indices assigned to this trainer."""
        indices = list(range(self.dataset_size))
        if self.shuffle:
            random.shuffle(indices)

        # Append extra samples to make it evenly distributed on all trainers.
        # The padding can exceed the dataset size (more trainers than
        # samples), so repeat the index list cyclically as often as needed.
        padding = self.total_size - len(indices)
        if padding > 0:
            repeats = -(-padding // len(indices))  # ceil(padding / len)
            indices += (indices * repeats)[:padding]
        assert len(indices) == self.total_size

        # Subset samples for each trainer.
        indices = indices[self.rank:self.total_size:self.num_trainers]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        """Return the number of indices assigned to each trainer."""
        return self.num_samples


224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
class BatchSampler(Sampler):
    r"""Wraps another sampler to yield a mini-batch of indices.

    Args:
        sampler (Sampler): Base sampler.
        batch_size (int): Size of mini-batch.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``

    Raises:
        ValueError: if ``sampler`` is not a ``Sampler``, ``batch_size`` is not
            a positive integer, or ``drop_last`` is not a bool.

    Example:
        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    """

    def __init__(self, sampler, batch_size, drop_last):
        if not isinstance(sampler, Sampler):
            raise ValueError("sampler should be an instance of "
                             "Sampler, but got sampler={}".format(sampler))
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError("batch_size should be a positive integer value, "
                             "but got batch_size={}".format(batch_size))
        if not isinstance(drop_last, bool):
            raise ValueError("drop_last should be a boolean value, but got "
                             "drop_last={}".format(drop_last))
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last

    def __iter__(self):
        """Yield lists of at most ``batch_size`` indices from the base sampler."""
        batch = []
        for idx in self.sampler:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                # Start a new list so the batch just yielded is not mutated.
                batch = []
        # Emit the trailing partial batch unless the caller asked to drop it.
        if len(batch) > 0 and not self.drop_last:
            yield batch

    def __len__(self):
        """Return the number of batches produced by one pass."""
        n_indices = len(self.sampler)
        if self.drop_last:
            return n_indices // self.batch_size
        # Ceiling division: a trailing partial batch counts as a batch.
        return (n_indices + self.batch_size - 1) // self.batch_size