remove old dataloader & generator from quantization (#55754)

* remove old dataloader & generator from quantization * fix unit test test_post_training_quantization_mnist

remove old dataloader & generator from quantization (#55754)
* remove old dataloader & generator from quantization * fix unit test test_post_training_quantization_mnist
e2e0d296 · JYChen · GitHub · 3e2c6a56 · e2e0d296 · e2e0d296
3 changed files
--- a/python/paddle/static/quantization/post_training_quantization.py
+++ b/python/paddle/static/quantization/post_training_quantization.py
@@ -23,12 +23,10 @@ try:
 except:
    from .utils import tqdm
-from inspect import isgeneratorfunction
 from paddle.fluid.framework import IrGraph, _get_var
 from ... import io, static
-from ...fluid import reader
 from ...framework import core
 from ...utils import unique_name
 from ..log_helper import get_logger
@@ -171,16 +169,16 @@ class PostTrainingQuantization:
                When all parameters were saved in a single binary file, set it
                as the real filename. If parameters were saved in separate files,
                set it as 'None'. Default is 'None'.
-            batch_generator(Python Generator): The batch generator provides
+            batch_generator(Python Generator, deprecated): The batch generator provides
                calibrate data for DataLoader, and it returns a batch every
                time. Note that, sample_generator and batch_generator, only one
                should be set. Beisdes, batch_generator supports lod tensor.
-            sample_generator(Python Generator): The sample generator provides
+            sample_generator(Python Generator, deprecated): The sample generator provides
                calibrate data for DataLoader, and it only returns a sample every
                time. Note that, sample_generator and batch_generator, only one
                should be set. Beisdes, sample_generator dose not support lod tensor.
-            data_loader(Python Generator, Paddle.io.DataLoader, optional): The
+            data_loader(paddle.io.DataLoader): The
-                Generator or Dataloader provides calibrate data, and it could
+                DataLoader provides calibrate data, and it could
                return a batch every time.
            batch_size(int, optional): The batch size of DataLoader. Default is 10.
            batch_nums(int, optional): If batch_nums is not None, the number of
@@ -309,22 +307,12 @@ class PostTrainingQuantization:
        # Check inputs
        assert executor is not None, "The executor cannot be None."
-        assert any(
+        assert data_loader is not None, "data_loader cannot be None."
-            [gen is not None]
-            for gen in [sample_generator, batch_generator, data_loader]
+        assert isinstance(
-        ), (
+            data_loader, io.DataLoader
-            "The sample_generator, batch_generator "
+        ), "data_loader only accepts `paddle.io.DataLoader`."
-            "and data_loader cannot be None in the same time."
-        )
-        if data_loader is not None:
-            assert isinstance(
-                data_loader,
-                (
-                    io.DataLoader,
-                    type(isgeneratorfunction),
-                    reader.GeneratorLoader,
-                ),
-            ), "data_loader only accepts `paddle.io.DataLoader` or Generator instance."
        assert batch_size > 0, "The batch_size should be greater than 0."
        assert (
            algo in self._support_algo_type
@@ -615,29 +603,8 @@ class PostTrainingQuantization:
            for var_name in self._feed_list
        ]
-        if self._data_loader is not None:
-            self._batch_nums = (
-                self._batch_nums if self._batch_nums else len(self._data_loader)
-            )
-            return
-        self._data_loader = reader.DataLoader.from_generator(
-            feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True
-        )
-        if self._sample_generator is not None:
-            self._data_loader.set_sample_generator(
-                self._sample_generator,
-                batch_size=self._batch_size,
-                drop_last=True,
-                places=self._place,
-            )
-        elif self._batch_generator is not None:
-            self._data_loader.set_batch_generator(
-                self._batch_generator, places=self._place
-            )
        self._batch_nums = (
-            self._batch_nums
+            self._batch_nums if self._batch_nums else len(self._data_loader)
-            if self._batch_nums
-            else len(list(self._data_loader))
        )
    def _optimize_fp32_model(self):

--- a/test/quantization/test_post_training_quantization_mnist.py
+++ b/test/quantization/test_post_training_quantization_mnist.py
@@ -30,6 +30,23 @@ random.seed(0)
 np.random.seed(0)
+class TransedMnistDataSet(paddle.io.Dataset):
+    def __init__(self, mnist_data):
+        self.mnist_data = mnist_data
+    def __getitem__(self, idx):
+        img = (
+            np.array(self.mnist_data[idx][0])
+            .astype('float32')
+            .reshape(1, 28, 28)
+        )
+        batch = img / 127.5 - 1.0
+        return {"img": batch}
+    def __len__(self):
+        return len(self.mnist_data)
 class TestPostTrainingQuantization(unittest.TestCase):
    def setUp(self):
        self.root_path = tempfile.TemporaryDirectory()
@@ -217,14 +234,27 @@ class TestPostTrainingQuantization(unittest.TestCase):
    ):
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
-        val_reader = paddle.dataset.mnist.train()
+        train_dataset = paddle.vision.datasets.MNIST(
+            mode='train', transform=None
+        )
+        train_dataset = TransedMnistDataSet(train_dataset)
+        BatchSampler = paddle.io.BatchSampler(
+            train_dataset, batch_size=batch_size
+        )
+        val_data_generator = paddle.io.DataLoader(
+            train_dataset,
+            batch_sampler=BatchSampler,
+            places=paddle.static.cpu_places(),
+        )
        ptq = PostTrainingQuantization(
            executor=exe,
            model_dir=model_path,
            model_filename=model_filename,
            params_filename=params_filename,
-            sample_generator=val_reader,
+            sample_generator=None,
+            data_loader=val_data_generator,
            batch_size=batch_size,
            batch_nums=batch_nums,
            algo=algo,

--- a/test/quantization/test_post_training_quantization_while.py
+++ b/test/quantization/test_post_training_quantization_while.py
@@ -29,6 +29,23 @@ random.seed(0)
 np.random.seed(0)
+class TransedMnistDataSet(paddle.io.Dataset):
+    def __init__(self, mnist_data):
+        self.mnist_data = mnist_data
+    def __getitem__(self, idx):
+        img = (
+            np.array(self.mnist_data[idx][0])
+            .astype('float32')
+            .reshape(1, 28, 28)
+        )
+        batch = img / 127.5 - 1.0
+        return {"x": batch}
+    def __len__(self):
+        return len(self.mnist_data)
 class TestPostTrainingQuantization(unittest.TestCase):
    def setUp(self):
        self.download_path = 'int8/download'
@@ -132,28 +149,30 @@ class TestPostTrainingQuantization(unittest.TestCase):
        is_optimize_model=False,
        batch_size=10,
        batch_nums=10,
-        is_data_loader=False,
    ):
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
-        val_reader = paddle.dataset.mnist.train()
-        def val_data_generator():
+        train_dataset = paddle.vision.datasets.MNIST(
-            batches = []
+            mode='train', transform=None
-            for data in val_reader():
+        )
-                batches.append(data[0].reshape(1, 28, 28))
+        train_dataset = TransedMnistDataSet(train_dataset)
-                if len(batches) == batch_size:
+        BatchSampler = paddle.io.BatchSampler(
-                    batches = np.asarray(batches)
+            train_dataset, batch_size=batch_size
-                    yield {"x": batches}
+        )
-                    batches = []
+        val_data_generator = paddle.io.DataLoader(
+            train_dataset,
+            batch_sampler=BatchSampler,
+            places=paddle.static.cpu_places(),
+        )
        ptq = PostTrainingQuantization(
            executor=exe,
            model_dir=model_path,
            model_filename='model.pdmodel',
            params_filename='model.pdiparams',
-            sample_generator=val_reader if not is_data_loader else None,
+            sample_generator=None,
-            data_loader=val_data_generator if is_data_loader else None,
+            data_loader=val_data_generator,
            batch_size=batch_size,
            batch_nums=batch_nums,
            algo=algo,
@@ -183,7 +202,6 @@ class TestPostTrainingQuantization(unittest.TestCase):
        batch_size=10,
        infer_iterations=10,
        quant_iterations=5,
-        is_data_loader=False,
    ):
        origin_model_path = self.download_model(data_url, data_md5, model_name)
@@ -210,7 +228,6 @@ class TestPostTrainingQuantization(unittest.TestCase):
            is_optimize_model,
            batch_size,
            quant_iterations,
-            is_data_loader=is_data_loader,
        )
        print(
@@ -442,7 +459,6 @@ class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization):
            batch_size,
            infer_iterations,
            quant_iterations,
-            is_data_loader=True,
        )