Commit 79f087c2 authored by mindspore-ci-bot, committed by Gitee

!909 Adding fix for set seed

Merge pull request !909 from EricZ/test_random
......@@ -15,7 +15,7 @@
"""
The configuration manager.
"""
import random
import mindspore._c_dataengine as cde
INT32_MAX = 2147483647
......@@ -32,6 +32,12 @@ class ConfigurationManager:
"""
Set the seed to be used in any random generator. This is used to produce deterministic results.
Note:
This set_seed function also seeds Python's random module, so Python augmentations that
use randomness become deterministic. Call set_seed before creating each iterator to
reset the random seed. In our pipeline this does not guarantee deterministic results
with num_parallel_workers > 1.
Args:
seed(int): seed to be set
......@@ -47,6 +53,7 @@ class ConfigurationManager:
if seed < 0 or seed > UINT32_MAX:
raise ValueError("Seed given is not within the required range")
self.config.set_seed(seed)
random.seed(seed)
def get_seed(self):
"""
......
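The Note in set_seed above implies a usage pattern worth spelling out. Below is a minimal sketch of that pattern; build_pipeline() is a hypothetical stand-in for any dataset that applies Python augmentations using randomness, and is not part of this change.

import mindspore.dataset as ds

ds.config.set_seed(0)   # seeds the C++ engine and, with this fix, also calls random.seed(0)
first = [row["image"] for row in build_pipeline().create_dict_iterator()]

ds.config.set_seed(0)   # re-seed before each new iterator to replay the same sequence
second = [row["image"] for row in build_pipeline().create_dict_iterator()]

# With num_parallel_workers == 1 the two passes should match element-wise;
# with num_parallel_workers > 1 this is not guaranteed (see the Note above).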
......@@ -13,14 +13,19 @@
# limitations under the License.
# ==============================================================================
"""
Testing configuration manager
"""
import filecmp
import glob
import numpy as np
import os
from mindspore import log as logger
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as vision
import mindspore.dataset.transforms.vision.py_transforms as py_vision
DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
......@@ -46,9 +51,17 @@ def test_basic():
assert ds.config.get_prefetch_size() == 4
assert ds.config.get_seed() == 5
def test_get_seed():
"""
Get the seed value without explicitly setting a default; expect an int.
"""
assert isinstance(ds.config.get_seed(), int)
def test_pipeline():
"""
Test that our configuration pipeline works when we set parameters at dataset interval
"""
Test that our configuration pipeline works when we set parameters at different locations in dataset code
"""
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
ds.config.set_num_parallel_workers(2)
......@@ -60,12 +73,12 @@ def test_pipeline():
data2 = data2.map(input_columns=["image"], operations=[vision.Decode(True)])
ds.serialize(data2, "testpipeline2.json")
# check that the generated output is different
assert (filecmp.cmp('testpipeline.json', 'testpipeline2.json'))
# this test passes currently because our num_parallel_workers don't get updated.
# remove generated json files
file_list = glob.glob('*.json')
for f in file_list:
try:
......@@ -74,6 +87,209 @@ def test_pipeline():
logger.info("Error while deleting: {}".format(f))
def test_deterministic_run_fail():
"""
Test RandomCrop with seed, expected to fail
"""
logger.info("test_deterministic_run_fail")
# when we set the seed all operations within our dataset should be deterministic
ds.config.set_seed(0)
ds.config.set_num_parallel_workers(1)
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
# The op reads the seed when its constructor is called, so if this op is re-used the results
# will not be the same between the two datasets. For example, the RandomCrop constructor takes
# the seed (0) and outputs one deterministic series of numbers, e.g. "a" = [1, 2, 3, 4, 5, 6]
# <- pretend these are random
random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
decode_op = vision.Decode()
data1 = data1.map(input_columns=["image"], operations=decode_op)
data1 = data1.map(input_columns=["image"], operations=random_crop_op)
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
data2 = data2.map(input_columns=["image"], operations=decode_op)
# The seed was already consumed by random_crop_op's constructor, so re-using the op here continues the same sequence
data2 = data2.map(input_columns=["image"], operations=random_crop_op)
try:
for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
np.testing.assert_equal(item1["image"], item2["image"])
except BaseException as e:
# the two datasets split the numbers of sequence "a" between them, so the images differ
logger.info("Got an exception in DE: {}".format(str(e)))
assert "Array" in str(e)
def test_deterministic_run_pass():
"""
Test deterministic run with the seed set
"""
logger.info("test_deterministic_run_pass")
ds.config.set_seed(0)
ds.config.set_num_parallel_workers(1)
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
# We get the seed when constructor is called
random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
decode_op = vision.Decode()
data1 = data1.map(input_columns=["image"], operations=decode_op)
data1 = data1.map(input_columns=["image"], operations=random_crop_op)
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
data2 = data2.map(input_columns=["image"], operations=decode_op)
# Since the seed is read at construction time, the two ops output the same deterministic sequence.
# Assume the generated random sequence "a" = [1, 2, 3, 4, 5, 6] <- pretend these are random
random_crop_op2 = vision.RandomCrop([512, 512], [200, 200, 200, 200])
data2 = data2.map(input_columns=["image"], operations=random_crop_op2)
try:
for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
np.testing.assert_equal(item1["image"], item2["image"])
except BaseException as e:
# two datasets both use numbers from the generated sequence "a"
logger.info("Got an exception in DE: {}".format(str(e)))
assert "Array" in str(e)
def test_seed_undeterministic():
"""
Test seed with num_parallel_workers in C; this test is expected to fail some of the time
"""
logger.info("test_seed_undeterministic")
ds.config.set_seed(0)
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
# seed will be read in during constructor call
random_crop_op = vision.RandomCrop([512, 512], [200, 200, 200, 200])
decode_op = vision.Decode()
data1 = data1.map(input_columns=["image"], operations=decode_op)
data1 = data1.map(input_columns=["image"], operations=random_crop_op)
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
data2 = data2.map(input_columns=["image"], operations=decode_op)
# Since the seed is read at construction time, the two ops output the same deterministic sequence
random_crop_op2 = vision.RandomCrop([512, 512], [200, 200, 200, 200])
data2 = data2.map(input_columns=["image"], operations=random_crop_op2)
for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
np.testing.assert_equal(item1["image"], item2["image"])
def test_deterministic_run_distribution():
"""
Test deterministic run with the seed set, used in a distribution
"""
logger.info("test_deterministic_run_distribution")
# when we set the seed all operations within our dataset should be deterministic
ds.config.set_seed(0)
ds.config.set_num_parallel_workers(1)
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
random_flip_op = vision.RandomHorizontalFlip(0.1)
decode_op = vision.Decode()
data1 = data1.map(input_columns=["image"], operations=decode_op)
data1 = data1.map(input_columns=["image"], operations=random_crop_op)
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
data2 = data2.map(input_columns=["image"], operations=decode_op)
# Since the seed is read at construction time, the two ops output the same deterministic sequence
random_flip_op2 = vision.RandomHorizontalFlip(0.1)
data2 = data2.map(input_columns=["image"], operations=random_flip_op2)
for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
np.testing.assert_equal(item1["image"], item2["image"])
def test_deterministic_python_seed():
"""
Test deterministic execution with seed in python
"""
logger.info("deterministic_random_crop_op_python_2")
ds.config.set_seed(0)
ds.config.set_num_parallel_workers(1)
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
transforms = [
py_vision.Decode(),
py_vision.RandomCrop([512, 512], [200, 200, 200, 200]),
py_vision.ToTensor(),
]
transform = py_vision.ComposeOp(transforms)
data1 = data1.map(input_columns=["image"], operations=transform())
data1_output = []
# config.set_seed() calls random.seed()
for data_one in data1.create_dict_iterator():
data1_output.append(data_one["image"])
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
data2 = data2.map(input_columns=["image"], operations=transform())
# config.set_seed() calls random.seed(), resets seed for next dataset iterator
ds.config.set_seed(0)
data2_output = []
for data_two in data2.create_dict_iterator():
data2_output.append(data_two["image"])
np.testing.assert_equal(data1_output, data2_output)
def test_deterministic_python_seed_multi_thread():
"""
Test deterministic execution with seed in python, this fails with multi-thread pyfunc run
"""
logger.info("deterministic_random_crop_op_python_2")
ds.config.set_seed(0)
# when we set the seed all operations within our dataset should be deterministic
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
transforms = [
py_vision.Decode(),
py_vision.RandomCrop([512, 512], [200, 200, 200, 200]),
py_vision.ToTensor(),
]
transform = py_vision.ComposeOp(transforms)
data1 = data1.map(input_columns=["image"], operations=transform(), python_multiprocessing=True)
data1_output = []
# config.set_seed() calls random.seed()
for data_one in data1.create_dict_iterator():
data1_output.append(data_one["image"])
# Second dataset
data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
# same transform pipeline, again with python_multiprocessing=True
data2 = data2.map(input_columns=["image"], operations=transform(), python_multiprocessing=True)
# config.set_seed() calls random.seed()
ds.config.set_seed(0)
data2_output = []
for data_two in data2.create_dict_iterator():
data2_output.append(data_two["image"])
try:
np.testing.assert_equal(data1_output, data2_output)
except BaseException as e:
# expect output to not match during multi-threaded execution
logger.info("Got an exception in DE: {}".format(str(e)))
assert "Array" in str(e)
if __name__ == '__main__':
test_basic()
test_pipeline()
test_deterministic_run_pass()
test_deterministic_run_distribution()
test_deterministic_run_fail()
test_deterministic_python_seed()
test_seed_undeterministic()
test_get_seed()
......@@ -36,6 +36,7 @@ def test_textline_dataset_all_file():
assert(count == 5)
def test_textline_dataset_totext():
ds.config.set_num_parallel_workers(4)
data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=False)
count = 0
line = ["This is a text file.", "Another file.", "Be happy every day.", "End of file.", "Good luck to everyone."]
......
......@@ -37,7 +37,7 @@ def visualize(first, mse, second):
plt.subplot(142)
plt.imshow(second)
plt.title("py random_color_jitter image")
plt.title("py random_color_adjust image")
plt.subplot(143)
plt.imshow(first - second)
......@@ -50,20 +50,20 @@ def diff_mse(in1, in2):
return mse * 100
def test_random_color_adjust_op_brightness():
"""
Test RandomColorAdjust op
"""
logger.info("test_random_color_jitter_op")
logger.info("test_random_color_adjust_op")
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
decode_op = c_vision.Decode()
random_adjust_op = c_vision.RandomColorAdjust((0.8, 0.8), (1, 1), (1, 1), (0, 0))
ctrans = [decode_op,
random_adjust_op,
]
data1 = data1.map(input_columns=["image"], operations=ctrans)
......@@ -100,20 +100,20 @@ def test_random_color_jitter_op_brightness():
# visualize(c_image, mse, py_image)
def test_random_color_adjust_op_contrast():
"""
Test RandomColorAdjust op
"""
logger.info("test_random_color_jitter_op")
logger.info("test_random_color_adjust_op")
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
decode_op = c_vision.Decode()
random_adjust_op = c_vision.RandomColorAdjust((1, 1), (0.5, 0.5), (1, 1), (0, 0))
ctrans = [decode_op,
random_adjust_op
]
data1 = data1.map(input_columns=["image"], operations=ctrans)
......@@ -156,20 +156,20 @@ def test_random_color_jitter_op_contrast():
# visualize(c_image, mse, py_image)
def test_random_color_adjust_op_saturation():
"""
Test RandomColorAdjust op
"""
logger.info("test_random_color_jitter_op")
logger.info("test_random_color_adjust_op")
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
decode_op = c_vision.Decode()
random_adjust_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (0.5, 0.5), (0, 0))
ctrans = [decode_op,
random_adjust_op
]
data1 = data1.map(input_columns=["image"], operations=ctrans)
......@@ -209,20 +209,20 @@ def test_random_color_jitter_op_saturation():
# visualize(c_image, mse, py_image)
def test_random_color_adjust_op_hue():
"""
Test RandomColorAdjust op
"""
logger.info("test_random_color_jitter_op")
logger.info("test_random_color_adjust_op")
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
decode_op = c_vision.Decode()
random_adjust_op = c_vision.RandomColorAdjust((1, 1), (1, 1), (1, 1), (0.2, 0.2))
ctrans = [decode_op,
random_adjust_op,
]
data1 = data1.map(input_columns=["image"], operations=ctrans)
......@@ -264,7 +264,7 @@ def test_random_color_jitter_op_hue():
if __name__ == "__main__":
test_random_color_adjust_op_brightness()
test_random_color_adjust_op_contrast()
test_random_color_adjust_op_saturation()
test_random_color_adjust_op_hue()
......@@ -17,8 +17,8 @@ Testing RandomCropAndResize op in DE
"""
import matplotlib.pyplot as plt
import mindspore.dataset.transforms.vision.c_transforms as vision
from mindspore import log as logger
import mindspore.dataset as ds
DATA_DIR = ["../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"]
......@@ -45,9 +45,9 @@ def visualize(a, mse, original):
def test_random_crop_op():
"""
Test RandomCropAndResize op
Test RandomCrop Op
"""
logger.info("test_random_crop_and_resize_op")
logger.info("test_random_crop_op")
# First dataset
data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
......@@ -67,3 +67,4 @@ def test_random_crop_op():
if __name__ == "__main__":
test_random_crop_op()
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
import mindspore.dataset as ds
from mindspore import log as logger
......@@ -34,9 +35,9 @@ def test_rename():
for i, item in enumerate(data.create_dict_iterator()):
logger.info("item[mask] is {}".format(item["masks"]))
assert item["masks"].all() == item["input_ids"].all()
np.testing.assert_equal (item["masks"], item["input_ids"])
logger.info("item[seg_ids] is {}".format(item["seg_ids"]))
assert item["segment_ids"].all() == item["seg_ids"].all()
np.testing.assert_equal (item["segment_ids"], item["seg_ids"])
# need to consume the data in the buffer
num_iter += 1
logger.info("Number of data in data: {}".format(num_iter))
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np
from util import save_and_check
import mindspore.dataset as ds
......@@ -117,6 +118,27 @@ def test_shuffle_05():
save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_shuffle_06():
"""
Test shuffle: with the seed set, both datasets produce the same shuffled order
"""
logger.info("test_shuffle_06")
# define parameters
buffer_size = 13
seed = 1
# apply dataset operations
data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
ds.config.set_seed(seed)
data1 = data1.shuffle(buffer_size=buffer_size)
data2 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
data2 = data2.shuffle(buffer_size=buffer_size)
for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
np.testing.assert_equal(item1, item2)
def test_shuffle_exception_01():
"""
Test shuffle exception: buffer_size<0
......@@ -231,6 +253,7 @@ if __name__ == '__main__':
test_shuffle_03()
test_shuffle_04()
test_shuffle_05()
test_shuffle_06()
test_shuffle_exception_01()
test_shuffle_exception_02()
test_shuffle_exception_03()
......