diff --git a/imperative/python/megengine/data/dataset/vision/mnist.py b/imperative/python/megengine/data/dataset/vision/mnist.py index 89549c2f7d7d1916d95c13149ad2fad5f04741f1..e2b24638a146f5411a30b7242e558ac235020280 100644 --- a/imperative/python/megengine/data/dataset/vision/mnist.py +++ b/imperative/python/megengine/data/dataset/vision/mnist.py @@ -15,50 +15,73 @@ logger = get_logger(__name__) class MNIST(VisionDataset): - r""":class:`~.Dataset` for MNIST meta data.""" - - url_path = "http://yann.lecun.com/exdb/mnist/" - """ - Url prefix for downloading raw file. + r"""MNIST dataset. + The MNIST_ database (Modified National Institute of Standards and Technology database) + is a large database of handwritten digits that is commonly used for training various image processing systems. + The database is also widely used for training and testing in the field of machine learning. + It was created by "re-mixing" the samples from `NIST`_'s original datasets. + Furthermore, the black and white images from NIST were normalized to fit into a 28x28 pixel + bounding box and anti-aliased, which introduced grayscale levels. + The MNIST database contains 60,000 training images and 10,000 testing images. + + The above introduction comes from `MNIST database - Wikipedia + <https://en.wikipedia.org/wiki/MNIST_database>`_. + + Args: + root: Path for MNIST dataset downloading or loading. If it's ``None``, + it will be set to ``~/.cache/megengine`` (the default root path). + train: If ``True``, use training dataset; otherwise use the test set. + download: If ``True``, downloads the dataset from the internet and puts it in ``root`` directory. + If dataset is already downloaded, it is not downloaded again. + + Returns: + The MNIST :class:`~.Dataset` that can work with :class:`~.DataLoader`. 
+ + Example: + + >>> from megengine.data.dataset import MNIST # doctest: +SKIP + >>> mnist = MNIST("/data/datasets/MNIST") # Set the root path # doctest: +SKIP + >>> image, label = mnist[0] # doctest: +SKIP + >>> image.shape # doctest: +SKIP + (28, 28, 1) + + .. versionchanged:: 1.11 The original URL has been updated to a mirror URL + + *"Please refrain from accessing these files from automated scripts with high frequency. Make copies!"* + As requested by the original provider of the MNIST dataset, + now the dataset will be downloaded from the mirror site: + https://ossci-datasets.s3.amazonaws.com/mnist/ + + .. seealso:: + + * MNIST dataset is used in :ref:`megengine-quick-start` tutorial as an example. + * You can find a lot of machine learning projects using MNIST dataset on the internet. + + .. _MNIST: http://yann.lecun.com/exdb/mnist/ + .. _NIST: https://www.nist.gov/data """ + + url_path = "https://ossci-datasets.s3.amazonaws.com/mnist/" + raw_file_name = [ "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz", "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz", ] - """ - Raw file names of both training set and test set (10k). - """ + raw_file_md5 = [ "f68b3c2dcbeaaa9fbdd348bbdeb94873", "d53e105ee54ea40749a09fcbcd1e9432", "9fb629c4189551a2d022fa330f9573f3", "ec29112dd5afa0611ce80d1b7f02629c", ] - """ - Md5 for checking raw files. - """ def __init__( - self, - root: str = None, - train: bool = True, - download: bool = True, - timeout: int = 500, + self, root: str = None, train: bool = True, download: bool = True, ): - r""" - :param root: path for mnist dataset downloading or loading, if ``None``, - set ``root`` to the ``_default_root``. - :param train: if ``True``, loading trainingset, else loading test set. - :param download: if raw files do not exists and download sets to ``True``, - download raw files and process, otherwise raise ValueError, default is True. 
- - """ super().__init__(root, order=("image", "image_category")) - self.timeout = timeout - # process the root path if root is None: self.root = self._default_root