diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc
index 522c3c111677433a78e8f0418161f7519e9ac29b..51237f58cd17de4a376170e044ec556a2cdc1634 100644
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/dataset_op.cc
@@ -387,6 +387,9 @@ uint32_t DatasetOp::GenerateCRC(const std::shared_ptr<DatasetOp> &op) {
   ss_str = std::regex_replace(ss_str, std::regex("Num workers.*\n"), "");
   ss_str = std::regex_replace(ss_str, std::regex("\\[workers.*\\]"), "");
 
+  // Filter out Number of rows when generating the check sum
+  ss_str = std::regex_replace(ss_str, std::regex("Number of rows.*\n"), "");
+
   // Filter out the Operator control flags field when generating the check sum
   ss_str = std::regex_replace(ss_str, std::regex("Operator control flags.*\n"), "");
 
diff --git a/mindspore/dataset/engine/datasets.py b/mindspore/dataset/engine/datasets.py
index 3319c7d53fb9e96ce333456cee1cb84d9ff7eddc..93c5f7743c95bcc8fc4510a8b34c14a99a5034f3 100644
--- a/mindspore/dataset/engine/datasets.py
+++ b/mindspore/dataset/engine/datasets.py
@@ -435,7 +435,8 @@ class Dataset:
                 parallel (default=None, the value from the config will be used).
             python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This
                 option could be beneficial if the python operation is computational heavy (default=False).
-            cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used)
+            cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used).
+                The cache feature is under development and is not recommended.
 
         Returns:
             MapDataset, dataset after mapping operation.
@@ -1951,7 +1952,9 @@ class MapDataset(DatasetOp):
             in parallel (default=None).
         python_multiprocessing (bool, optional): Parallelize python operations with multiple worker process. This
             option could be beneficial if the python operation is computational heavy (default=False).
-        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used)
+        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used).
+            The cache feature is under development and is not recommended.
+
 
     Raises:
         ValueError: If len(input_columns) != len(output_columns) and columns_order is not specified.
@@ -2141,6 +2144,7 @@ class RepeatDataset(DatasetOp):
         """
         return self.count
 
+
 class SkipDataset(DatasetOp):
     """
     The result of applying Skip operator to the input Dataset.
@@ -2406,6 +2410,7 @@ class TransferDataset(DatasetOp):
     def stop_send(self):
         self.iterator.depipeline.StopSend()
 
+
 class RangeDataset(MappableDataset):
     """
     A source dataset that reads and parses datasets stored on disk in a range.
@@ -2552,7 +2557,8 @@ class ImageFolderDatasetV2(MappableDataset):
            into (default=None).
        shard_id (int, optional): The shard ID within num_shards (default=None). This
            argument should be specified only when num_shards is also specified.
-        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used)
+        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used).
+            The cache feature is under development and is not recommended.
 
     Raises:
         RuntimeError: If sampler and shuffle are specified at the same time.
@@ -3348,7 +3354,8 @@ class TFRecordDataset(SourceDataset):
            argument should be specified only when num_shards is also specified.
        shard_equal_rows (bool): Get equal rows for all shards(default=False). If shard_equal_rows is false, number
            of rows of each shard may be not equal.
-        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used)
+        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used).
+            The cache feature is under development and is not recommended.
    Examples:
        >>> import mindspore.dataset as ds
        >>> import mindspore.common.dtype as mstype
@@ -3919,7 +3926,8 @@ class RandomDataset(SourceDataset):
        num_samples (int): number of samples to draw from the total. (default=None, which means all rows)
        num_parallel_workers (int, optional): number of workers to read the data
            (default=None, number set in the config).
-        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used)
+        cache (DatasetCache, optional): Tensor cache to use. (default=None which means no cache is used).
+            The cache feature is under development and is not recommended.
        shuffle (bool, optional): Whether or not to perform shuffle on the dataset
            (default=None, expected order behavior shown in the table).
        num_shards (int, optional): Number of shards that the dataset should be divided
@@ -5313,6 +5321,7 @@ class BuildVocabDataset(DatasetOp):
 
         return new_op
 
+
 class BuildSentencePieceVocabDataset(DatasetOp):
     """
     Build a SentencePieceVocab from a dataset.
diff --git a/tests/ut/python/dataset/test_cache_map.py b/tests/ut/python/dataset/test_cache_map.py
index 5de969db6df7f74b4a6370f83b9f742c5733620a..154a4208a009042acb50cd604421c4bd4f26db25 100644
--- a/tests/ut/python/dataset/test_cache_map.py
+++ b/tests/ut/python/dataset/test_cache_map.py
@@ -24,6 +24,7 @@ DATA_DIR = "../data/dataset/testImageNetData/train/"
 
 GENERATE_GOLDEN = False
 
+
 def test_cache_map_basic1():
     """
     Test mappable leaf with cache op right over the leaf
@@ -104,11 +105,36 @@ def test_cache_map_basic3():
     decode_op = c_vision.Decode()
     ds1 = ds1.repeat(4)
     ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
-    print("ds1.dataset_size is ", ds1.get_dataset_size())
+    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))
+
+    num_iter = 0
+    for _ in ds1.create_dict_iterator():
+        logger.info("get data from dataset")
+        num_iter += 1
+
+    logger.info("Number of data in ds1: {} ".format(num_iter))
+    assert num_iter == 8
+    logger.info('test_cache_basic3 Ended.\n')
+
+
+def test_cache_map_basic4():
+    """
+    Test different rows result in core dump
+    """
+    logger.info("Test cache basic 4")
+    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
+
+    # This DATA_DIR only has 2 images in it
+    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
+    decode_op = c_vision.Decode()
+    ds1 = ds1.repeat(4)
+    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
+    logger.info("ds1.dataset_size is {}".format(ds1.get_dataset_size()))
+    shape = ds1.output_shapes()
+    logger.info(shape)
 
     num_iter = 0
     for _ in ds1.create_dict_iterator():
-        print("get data from dataset")
+        logger.info("get data from dataset")
         num_iter += 1
 
     logger.info("Number of data in ds1: {} ".format(num_iter))
@@ -152,12 +178,15 @@ def test_cache_map_failure1():
     assert num_iter == 0
     logger.info('test_cache_failure1 Ended.\n')
 
+
 if __name__ == '__main__':
     test_cache_map_basic1()
-    print("test_cache_map_basic1 success.")
+    logger.info("test_cache_map_basic1 success.")
     test_cache_map_basic2()
-    print("test_cache_map_basic2 success.")
+    logger.info("test_cache_map_basic2 success.")
     test_cache_map_basic3()
-    print("test_cache_map_basic3 success.")
+    logger.info("test_cache_map_basic3 success.")
+    test_cache_map_basic4()
+    logger.info("test_cache_map_basic4 success.")
     test_cache_map_failure1()
-    print("test_cache_map_failure1 success.")
+    logger.info("test_cache_map_failure1 success.")
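
Reviewer note (sketch, not part of the patch): the dataset_op.cc hunk adds "Number of rows" to the volatile fields that GenerateCRC strips from the operator's textual dump before computing the cache checksum. Judging by the new test_cache_map_basic4, the intent is that a pipeline whose row count changes (here via repeat(4) together with get_dataset_size()/output_shapes()) still resolves to the same cache entry instead of triggering the reported core dump. Below is a minimal Python sketch of that filtering idea only; the generate_crc name and the use of zlib.crc32 are illustrative assumptions, the real routine is the C++ code above.

    # Sketch: mimic the regex filtering done in DatasetOp::GenerateCRC.
    import re
    import zlib

    VOLATILE_FIELDS = (
        r"Num workers.*\n",
        r"\[workers.*\]",
        r"Number of rows.*\n",          # newly filtered by this patch
        r"Operator control flags.*\n",
    )

    def generate_crc(op_dump: str) -> int:
        """Strip volatile fields from an operator dump, then checksum it."""
        for pattern in VOLATILE_FIELDS:
            op_dump = re.sub(pattern, "", op_dump)
        return zlib.crc32(op_dump.encode())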