Merge pull request #32 from LielinJiang/transforms

Add transforms

Merge pull request #32 from LielinJiang/transforms
Add transforms
d90f7cc6 · LielinJiang · GitHub · 3c5f0743 · 999b9ff5 · d90f7cc6
18 changed files
--- a/datasets/folder.py
+++ b/datasets/folder.py
@@ -71,14 +71,14 @@ class DatasetFolder(Dataset):

    Args:
        root (string): Root directory path.
-        loader (callable, optional): A function to load a sample given its path.
-        extensions (tuple[string], optional): A list of allowed extensions.
+        loader (callable|optional): A function to load a sample given its path.
+        extensions (tuple[str]|optional): A list of allowed extensions.
            both extensions and is_valid_file should not be passed.
-        transform (callable, optional): A function/transform that takes in
+        transform (callable|optional): A function/transform that takes in
            a sample and returns a transformed version.
-        target_transform (callable, optional): A function/transform that takes
+        target_transform (callable|optional): A function/transform that takes
            in the target and transforms it.
-        is_valid_file (callable, optional): A function that takes path of a file
+        is_valid_file (callable|optional): A function that takes path of a file
            and check if the file is a valid file (used to check of corrupt files)
            both extensions and is_valid_file should not be passed.

@@ -97,6 +97,8 @@ class DatasetFolder(Dataset):
                 target_transform=None,
                 is_valid_file=None):
        self.root = root
+        self.transform = transform
+        self.target_transform = target_transform
        if extensions is None:
            extensions = IMG_EXTENSIONS
        classes, class_to_idx = self._find_classes(self.root)

--- a/image_classification/README.MD
+++ b/image_classification/README.MD
@@ -76,10 +76,10 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch

 | 模型 | top1 acc | top5 acc |
 | --- | --- | --- |
-| [ResNet50](https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams) | 76.28 | 93.04 |
-| [vgg16](https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams) | 71.84 | 90.71 | 
-| [mobilenet_v1](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams) | 71.25 | 89.92 | 
-| [mobilenet_v2](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams) | 72.27 | 90.66 | 
+| [ResNet50](https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams) | 76.27 | 93.03 |
+| [vgg16](https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams) | 71.92 | 90.65 | 
+| [mobilenet_v1](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams) | 71.16 | 89.89 | 
+| [mobilenet_v2](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams) | 72.30 | 90.74 | 

 上述模型的复现参数请参考scripts下的脚本。


--- a/image_classification/imagenet_dataset.py
+++ b/image_classification/imagenet_dataset.py
@@ -19,80 +19,33 @@ import random
 import numpy as np

 from datasets.folder import DatasetFolder
-
-
-def center_crop_resize(img):
-    h, w = img.shape[:2]
-    c = int(224 / 256 * min((h, w)))
-    i = (h + 1 - c) // 2
-    j = (w + 1 - c) // 2
-    img = img[i:i + c, j:j + c, :]
-    return cv2.resize(img, (224, 224), 0, 0, cv2.INTER_LINEAR)
-
-
-def random_crop_resize(img):
-    height, width = img.shape[:2]
-    area = height * width
-
-    for attempt in range(10):
-        target_area = random.uniform(0.08, 1.) * area
-        log_ratio = (math.log(3 / 4), math.log(4 / 3))
-        aspect_ratio = math.exp(random.uniform(*log_ratio))
-
-        w = int(round(math.sqrt(target_area * aspect_ratio)))
-        h = int(round(math.sqrt(target_area / aspect_ratio)))
-
-        if w <= width and h <= height:
-            i = random.randint(0, height - h)
-            j = random.randint(0, width - w)
-            img = img[i:i + h, j:j + w, :]
-            return cv2.resize(img, (224, 224), 0, 0, cv2.INTER_LINEAR)
-
-    return center_crop_resize(img)
-
-
-def random_flip(img):
-    if np.random.randint(0, 2) == 1:
-        img = img[:, ::-1, :]
-    return img
-
-
-def normalize_permute(img):
-    # transpose and convert to RGB from BGR
-    img = img.astype(np.float32).transpose((2, 0, 1))[::-1, ...]
-    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
-    std = np.array([58.395, 57.120, 57.375], dtype=np.float32)
-    invstd = 1. / std
-    for v, m, s in zip(img, mean, invstd):
-        v.__isub__(m).__imul__(s)
-    return img
-
-
-def compose(functions):
-    def process(sample):
-        img, label = sample
-        for fn in functions:
-            img = fn(img)
-        return img, label
-
-    return process
+from transform import transforms
+from paddle import fluid


 class ImageNetDataset(DatasetFolder):
    def __init__(self, path, mode='train'):
        super(ImageNetDataset, self).__init__(path)
        self.mode = mode
+
+        normalize = transforms.Normalize(
+            mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375])
        if self.mode == 'train':
-            self.transform = compose([
-                cv2.imread, random_crop_resize, random_flip, normalize_permute
+            self.transform = transforms.Compose([
+                transforms.RandomResizedCrop(224),
+                transforms.RandomHorizontalFlip(),
+                transforms.Permute(mode='CHW'), normalize
            ])
        else:
-            self.transform = compose(
-                [cv2.imread, center_crop_resize, normalize_permute])
+            self.transform = transforms.Compose([
+                transforms.Resize(256), transforms.CenterCrop(224),
+                transforms.Permute(mode='CHW'), normalize
+            ])

    def __getitem__(self, idx):
-        img, label = self.samples[idx]
-        return self.transform((img, [label]))
+        img_path, label = self.samples[idx]
+        img = cv2.imread(img_path).astype(np.float32)
+        return self.transform(img), [label]

    def __len__(self):
        return len(self.samples)
--- a/models/mobilenetv1.py
+++ b/models/mobilenetv1.py
@@ -111,13 +111,22 @@ class MobileNetV1(Model):

    Args:
        scale (float): scale of channels in each layer. Default: 1.0.
-        class_dim (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
    """

-    def __init__(self, scale=1.0, class_dim=1000):
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
        super(MobileNetV1, self).__init__()
        self.scale = scale
        self.dwsl = []
+        self.num_classes = num_classes
+        self.with_pool = with_pool

        self.conv1 = ConvBNLayer(
            num_channels=3,
@@ -227,28 +236,34 @@ class MobileNetV1(Model):
            name="conv6")
        self.dwsl.append(dws6)

-        self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        if with_pool:
+            self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)

-        self.out = Linear(
-            int(1024 * scale),
-            class_dim,
-            act='softmax',
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "fc7_weights"),
-            bias_attr=ParamAttr(name="fc7_offset"))
+        if num_classes > -1:
+            self.out = Linear(
+                int(1024 * scale),
+                num_classes,
+                act=classifier_activation,
+                param_attr=ParamAttr(
+                    initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+                bias_attr=ParamAttr(name="fc7_offset"))

    def forward(self, inputs):
        y = self.conv1(inputs)
        for dws in self.dwsl:
            y = dws(y)
-        y = self.pool2d_avg(y)
-        y = fluid.layers.reshape(y, shape=[-1, 1024])
-        y = self.out(y)
+
+        if self.with_pool:
+            y = self.pool2d_avg(y)
+
+        if self.num_classes > 0:  # flatten to the input width self.out was built with
+            y = fluid.layers.reshape(y, shape=[-1, int(1024 * self.scale)])
+            y = self.out(y)
        return y


 def _mobilenet(arch, pretrained=False, **kwargs):
-    model = MobileNetV1(**kwargs)
+    model = MobileNetV1(num_classes=1000, with_pool=True, **kwargs)
    if pretrained:
        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
            arch)
@@ -262,5 +277,11 @@ def _mobilenet(arch, pretrained=False, **kwargs):


 def mobilenet_v1(pretrained=False, scale=1.0):
+    """MobileNetV1
+    
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
+    """
    model = _mobilenet('mobilenetv1_' + str(scale), pretrained, scale=scale)
    return model
--- a/models/mobilenetv2.py
+++ b/models/mobilenetv2.py
@@ -156,13 +156,21 @@ class MobileNetV2(Model):

    Args:
        scale (float): scale of channels in each layer. Default: 1.0.
-        class_dim (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
    """

-    def __init__(self, scale=1.0, class_dim=1000):
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
        super(MobileNetV2, self).__init__()
        self.scale = scale
-        self.class_dim = class_dim
+        self.num_classes = num_classes
+        self.with_pool = with_pool

        bottleneck_params_list = [
            (1, 16, 1, 1),
@@ -174,7 +182,6 @@ class MobileNetV2(Model):
            (6, 320, 1, 1),
        ]

-        #1. conv1 
        self._conv1 = ConvBNLayer(
            num_channels=3,
            num_filters=int(32 * scale),
@@ -182,7 +189,6 @@ class MobileNetV2(Model):
            stride=2,
            padding=1)

-        #2. bottleneck sequences
        self._invl = []
        i = 1
        in_c = int(32 * scale)
@@ -196,7 +202,6 @@ class MobileNetV2(Model):
            self._invl.append(tmp)
            in_c = int(c * scale)

-        #3. last_conv
        self._out_c = int(1280 * scale) if scale > 1.0 else 1280
        self._conv9 = ConvBNLayer(
            num_channels=in_c,
@@ -205,31 +210,34 @@ class MobileNetV2(Model):
            stride=1,
            padding=0)

-        #4. pool
-        self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        if with_pool:
+            self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)

-        #5. fc
-        tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
-        self._fc = Linear(
-            self._out_c,
-            class_dim,
-            act='softmax',
-            param_attr=tmp_param,
-            bias_attr=ParamAttr(name="fc10_offset"))
+        if num_classes > 0:
+            tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
+            self._fc = Linear(
+                self._out_c,
+                num_classes,
+                act=classifier_activation,
+                param_attr=tmp_param,
+                bias_attr=ParamAttr(name="fc10_offset"))

    def forward(self, inputs):
        y = self._conv1(inputs, if_act=True)
        for inv in self._invl:
            y = inv(y)
        y = self._conv9(y, if_act=True)
-        y = self._pool2d_avg(y)
-        y = fluid.layers.reshape(y, shape=[-1, self._out_c])
-        y = self._fc(y)
+
+        if self.with_pool:
+            y = self._pool2d_avg(y)
+        if self.num_classes > 0:
+            y = fluid.layers.reshape(y, shape=[-1, self._out_c])
+            y = self._fc(y)
        return y


 def _mobilenet(arch, pretrained=False, **kwargs):
-    model = MobileNetV2(**kwargs)
+    model = MobileNetV2(num_classes=1000, with_pool=True, **kwargs)
    if pretrained:
        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
            arch)
@@ -246,7 +254,8 @@ def mobilenet_v2(pretrained=False, scale=1.0):
    """MobileNetV2
    
    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
    """
    model = _mobilenet('mobilenetv2_' + str(scale), pretrained, scale=scale)
    return model
--- a/models/resnet.py
+++ b/models/resnet.py
@@ -163,12 +163,23 @@ class ResNet(Model):
    Args:
        Block (BasicBlock|BottleneckBlock): block module of model.
        depth (int): layers of resnet, default: 50.
-        num_classes (int): output dim of last fc layer, default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
    """

-    def __init__(self, Block, depth=50, num_classes=1000):
+    def __init__(self,
+                 Block,
+                 depth=50,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
        super(ResNet, self).__init__()

+        self.num_classes = num_classes
+        self.with_pool = with_pool
+
        layer_config = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
@@ -212,31 +223,37 @@ class ResNet(Model):
                                      Sequential(*blocks))
            self.layers.append(layer)

-        self.global_pool = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        if with_pool:
+            self.global_pool = Pool2D(
+                pool_size=7, pool_type='avg', global_pooling=True)

-        stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
-        self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
-        self.fc = Linear(
-            self.fc_input_dim,
-            num_classes,
-            act='softmax',
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)))
+        if num_classes > 0:
+            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
+            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
+            self.fc = Linear(
+                self.fc_input_dim,
+                num_classes,
+                act=classifier_activation,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.Uniform(-stdv, stdv)))

    def forward(self, inputs):
        x = self.conv(inputs)
        x = self.pool(x)
        for layer in self.layers:
            x = layer(x)
-        x = self.global_pool(x)
-        x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
-        x = self.fc(x)
+
+        if self.with_pool:
+            x = self.global_pool(x)
+
+        if self.num_classes > 0:  # fc and fc_input_dim are only created when num_classes > 0
+            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            x = self.fc(x)
        return x


 def _resnet(arch, Block, depth, pretrained):
-    model = ResNet(Block, depth)
+    model = ResNet(Block, depth, num_classes=1000, with_pool=True)
    if pretrained:
        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
            arch)

--- a/models/vgg.py
+++ b/models/vgg.py
@@ -23,12 +23,8 @@ from .download import get_weights_path
 __all__ = [
    'VGG',
    'vgg11',
-    'vgg11_bn',
    'vgg13',
-    'vgg13_bn',
    'vgg16',
-    'vgg16_bn',
-    'vgg19_bn',
    'vgg19',
 ]

@@ -39,11 +35,11 @@ model_urls = {


 class Classifier(fluid.dygraph.Layer):
-    def __init__(self, num_classes):
+    def __init__(self, num_classes, classifier_activation='softmax'):
        super(Classifier, self).__init__()
        self.linear1 = Linear(512 * 7 * 7, 4096)
        self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes, act='softmax')
+        self.linear3 = Linear(4096, num_classes, act=classifier_activation)

    def forward(self, x):
        x = self.linear1(x)
@@ -62,20 +58,30 @@ class VGG(Model):

    Args:
        features (fluid.dygraph.Layer): vgg features create by function make_layers.
-        num_classes (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer 
+                            will not be defined. Default: 1000.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
    """

-    def __init__(self, features, num_classes=1000):
+    def __init__(self,
+                 features,
+                 num_classes=1000,
+                 classifier_activation='softmax'):
        super(VGG, self).__init__()
        self.features = features
-        classifier = Classifier(num_classes)
-        self.classifier = self.add_sublayer("classifier",
-                                            Sequential(classifier))
+        self.num_classes = num_classes
+
+        if num_classes > 0:
+            classifier = Classifier(num_classes, classifier_activation)
+            self.classifier = self.add_sublayer("classifier",
+                                                Sequential(classifier))

    def forward(self, x):
        x = self.features(x)
-        x = fluid.layers.flatten(x, 1)
-        x = self.classifier(x)
+
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.classifier(x)
        return x


@@ -114,7 +120,10 @@ cfgs = {


 def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
-    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
+    model = VGG(make_layers(
+        cfgs[cfg], batch_norm=batch_norm),
+                num_classes=1000,
+                **kwargs)

    if pretrained:
        assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format(
@@ -128,73 +137,53 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
    return model


-def vgg11(pretrained=False, **kwargs):
+def vgg11(pretrained=False, batch_norm=False):
    """VGG 11-layer model
    
    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
    """
-    return _vgg('vgg11', 'A', False, pretrained, **kwargs)
-
+    model_name = 'vgg11'
+    if batch_norm:
+        model_name += ('_bn')
+    return _vgg(model_name, 'A', batch_norm, pretrained)

-def vgg11_bn(pretrained=False, **kwargs):
-    """VGG 11-layer model with batch normalization
-    
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg11_bn', 'A', True, pretrained, **kwargs)

-
-def vgg13(pretrained=False, **kwargs):
+def vgg13(pretrained=False, batch_norm=False):
    """VGG 13-layer model
    
    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg13', 'B', False, pretrained, **kwargs)
-
-
-def vgg13_bn(pretrained=False, **kwargs):
-    """VGG 13-layer model with batch normalization
-    
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
    """
-    return _vgg('vgg13_bn', 'B', True, pretrained, **kwargs)
+    model_name = 'vgg13'
+    if batch_norm:
+        model_name += ('_bn')
+    return _vgg(model_name, 'B', batch_norm, pretrained)


-def vgg16(pretrained=False, **kwargs):
+def vgg16(pretrained=False, batch_norm=False):
    """VGG 16-layer model 
    
    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg16', 'D', False, pretrained, **kwargs)
-
-
-def vgg16_bn(pretrained=False, **kwargs):
-    """VGG 16-layer with batch normalization
-    
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
    """
-    return _vgg('vgg16_bn', 'D', True, pretrained, **kwargs)
+    model_name = 'vgg16'
+    if batch_norm:
+        model_name += ('_bn')
+    return _vgg(model_name, 'D', batch_norm, pretrained)


-def vgg19(pretrained=False, **kwargs):
+def vgg19(pretrained=False, batch_norm=False):
    """VGG 19-layer model 
    
    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg19', 'E', False, pretrained, **kwargs)
-
-
-def vgg19_bn(pretrained=False, **kwargs):
-    """VGG 19-layer model with batch normalization
-    
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
    """
-    return _vgg('vgg19_bn', 'E', True, pretrained, **kwargs)
+    model_name = 'vgg19'
+    if batch_norm:
+        model_name += ('_bn')
+    return _vgg(model_name, 'E', batch_norm, pretrained)
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Before running the tests, add the hapi root path to PYTHONPATH:
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 import time
 import random

--- a/tests/test_data/class_a/ILSVRC2012_val_00000293.JPEG
+++ b/tests/test_data/class_a/ILSVRC2012_val_00000293.JPEG
--- a/tests/test_data/class_a/ILSVRC2012_val_00002138.JPEG
+++ b/tests/test_data/class_a/ILSVRC2012_val_00002138.JPEG
--- a/tests/test_data/class_b/ILSVRC2012_val_00000236.JPEG
+++ b/tests/test_data/class_b/ILSVRC2012_val_00000236.JPEG
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Before running the tests, add the hapi root path to PYTHONPATH:
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
+import unittest
+
+from datasets.folder import DatasetFolder
+
+
+class TestFolderDatasets(unittest.TestCase):
+    def test_dataset(self):
+        dataset_folder = DatasetFolder('test_data')
+
+        for _ in dataset_folder:
+            pass
+
+        assert len(dataset_folder) == 3
+        assert len(dataset_folder.classes) == 2
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -15,13 +15,13 @@
 from __future__ import division
 from __future__ import print_function

+# Before running the tests, add the hapi root path to PYTHONPATH:
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
+
 import unittest

 import os

-import sys
-sys.path.append('../')
-
 import numpy as np
 import contextlib


--- a/tests/test_progressbar.py
+++ b/tests/test_progressbar.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Before running the tests, add the hapi root path to PYTHONPATH:
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 import random
 import time

--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Before running the tests, add the hapi root path to PYTHONPATH:
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
+import unittest
+
+from datasets.folder import DatasetFolder
+from transform import transforms
+
+
+class TestTransforms(unittest.TestCase):
+    def do_transform(self, trans):
+        dataset_folder = DatasetFolder('test_data', transform=trans)
+
+        for _ in dataset_folder:
+            pass
+
+    def test_trans0(self):
+        normalize = transforms.Normalize(
+            mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375])
+        trans = transforms.Compose([
+            transforms.RandomResizedCrop(224), transforms.GaussianNoise(),
+            transforms.ColorJitter(
+                brightness=0.4, contrast=0.4, saturation=0.4,
+                hue=0.4), transforms.RandomHorizontalFlip(),
+            transforms.Permute(mode='CHW'), normalize
+        ])
+
+        self.do_transform(trans)
+
+    def test_trans1(self):
+        trans = transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+        ])
+        self.do_transform(trans)
+
+    def test_trans2(self):
+        trans = transforms.Compose([transforms.CenterCropResize(224)])
+        self.do_transform(trans)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/transform/__init__.py
+++ b/transform/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .transforms import *
--- a/transform/functional.py
+++ b/transform/functional.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import collections
+import random
+
+import cv2
+import numpy as np
+
+if sys.version_info < (3, 3):
+    Sequence = collections.Sequence
+    Iterable = collections.Iterable
+else:
+    Sequence = collections.abc.Sequence
+    Iterable = collections.abc.Iterable
+
+
+def flip(image, code):
+    """
+    Flip the input image according to the code (the type of flip).
+
+    Args:
+        image: Input image, with (H, W, C) shape
+        code: code that indicates the type of flip.
+            -1 : Flip horizontally and vertically
+            0 : Flip vertically
+            1 : Flip horizontally
+    """
+    return cv2.flip(image, flipCode=code)
+
+
+def resize(img, size, interpolation=cv2.INTER_LINEAR):
+    """
+    resize the input data to given size
+
+    Args:
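+        img: Input data, could be image or masks, with (H, W, C) shape
+        size: Target size. If an int, the smaller edge is resized to it and
+            the aspect ratio is kept; otherwise a (height, width) sequence.
+        interpolation: Interpolation method. If a sequence is given, one of
+            its elements is chosen at random.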
+    """
+
+    if isinstance(interpolation, Sequence):
+        interpolation = random.choice(interpolation)
+
+    if isinstance(size, int):
+        h, w = img.shape[:2]
+        if (w <= h and w == size) or (h <= w and h == size):
+            return img
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+            return cv2.resize(img, (ow, oh), interpolation=interpolation)
+        else:
+            oh = size
+            ow = int(size * w / h)
+            return cv2.resize(img, (ow, oh), interpolation=interpolation)
+    else:
+        return cv2.resize(img, size[::-1], interpolation=interpolation)
--- a/transform/transforms.py
+++ b/transform/transforms.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import math
+import sys
+import random
+import cv2
+
+import numpy as np
+import numbers
+import types
+import collections
+import warnings
+
+from . import functional as F
+
+if sys.version_info < (3, 3):
+    Iterable = collections.Iterable
+else:
+    Iterable = collections.abc.Iterable
+
+__all__ = [
+    "Compose",
+    "Resize",
+    "RandomResizedCrop",
+    "CenterCropResize",
+    "CenterCrop",
+    "RandomHorizontalFlip",
+    "RandomVerticalFlip",
+    "Permute",
+    "Normalize",
+    "GaussianNoise",
+    "BrightnessTransform",
+    "SaturationTransform",
+    "ContrastTransform",
+    "HueTransform",
+    "ColorJitter",
+]
+
+
+class Compose(object):
+    """Chain several transforms into a single callable.
+
+    Args:
+        transforms (list of ``Transform`` objects): callables applied in
+            order; each one receives the output of the previous one.
+
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img):
+        # Thread the data through every transform, first to last.
+        result = img
+        for transform in self.transforms:
+            result = transform(result)
+        return result
+
+    def __repr__(self):
+        # "ClassName(" header, one indented line per transform, then ")".
+        lines = [self.__class__.__name__ + '(']
+        lines.extend('    {0}'.format(t) for t in self.transforms)
+        lines.append(')')
+        return '\n'.join(lines)
+
+
+class Resize(object):
+    """Resize the input image to the given size.
+
+    Args:
+        size (int|list|tuple): Desired output size. If size is a sequence like
+            (h, w), output size will be matched to this. If size is an int,
+            smaller edge of the image will be matched to this number.
+            i.e, if height > width, then image will be rescaled to
+            (size * height / width, size)
+        interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
+    """
+
+    def __init__(self, size, interpolation=cv2.INTER_LINEAR):
+        # Accept either a single int or any two-element iterable.
+        assert (isinstance(size, Iterable) and
+                len(size) == 2) or isinstance(size, int)
+        self.interpolation = interpolation
+        self.size = size
+
+    def __call__(self, img):
+        """
+        Args:
+            img: Image to be scaled; handed unchanged to ``F.resize``
+                (presumably a numpy.ndarray, since the backend is OpenCV --
+                confirm against ``functional.resize``).
+
+        Returns:
+            The rescaled image returned by ``F.resize``.
+        """
+        resized = F.resize(img, self.size, self.interpolation)
+        return resized
+
+
+class RandomResizedCrop(object):
+    """Crop the input data to random size and aspect ratio.
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made.
+    After applying crop transform, the input data will be resized to given size.
+
+    Args:
+        output_size (int|list|tuple): Target size of output image, with (height, width) shape.
+            An int ``n`` is expanded to ``(n, n)``.
+        scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
+        ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
+        interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
+    """
+
+    def __init__(self,
+                 output_size,
+                 scale=(0.08, 1.0),
+                 ratio=(3. / 4, 4. / 3),
+                 interpolation=cv2.INTER_LINEAR):
+        if isinstance(output_size, int):
+            self.output_size = (output_size, output_size)
+        else:
+            self.output_size = output_size
+        assert (scale[0] <= scale[1]), "scale should be of kind (min, max)"
+        assert (ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
+        self.scale = scale
+        self.ratio = ratio
+        self.interpolation = interpolation
+
+    def _get_params(self, image, attempts=10):
+        """Pick the crop window.
+
+        Args:
+            image: array whose first two axes are (height, width); a trailing
+                channel axis is optional.
+            attempts (int): number of random draws before falling back to a
+                central crop.
+
+        Returns:
+            tuple: ``(x, y, w, h)`` -- left, top, width and height of the crop.
+        """
+        # Only the first two axes are needed, so 2-D (grayscale) arrays work
+        # as well as H x W x C ones.
+        height, width = image.shape[:2]
+        area = height * width
+        # The log-space bounds do not change between attempts; compute once.
+        log_ratio = tuple(math.log(x) for x in self.ratio)
+
+        for _ in range(attempts):
+            target_area = np.random.uniform(*self.scale) * area
+            # Sampling in log space treats ratios r and 1/r symmetrically.
+            aspect_ratio = math.exp(np.random.uniform(*log_ratio))
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            if 0 < w <= width and 0 < h <= height:
+                x = np.random.randint(0, width - w + 1)
+                y = np.random.randint(0, height - h + 1)
+                return x, y, w, h
+
+        # Fallback to central crop, clamping the aspect ratio into range.
+        in_ratio = float(width) / float(height)
+        if in_ratio < min(self.ratio):
+            w = width
+            h = int(round(w / min(self.ratio)))
+        elif in_ratio > max(self.ratio):
+            h = height
+            w = int(round(h * max(self.ratio)))
+        else:  # whole image
+            w = width
+            h = height
+        x = (width - w) // 2
+        y = (height - h) // 2
+        return x, y, w, h
+
+    def __call__(self, img):
+        """Crop a random window from ``img`` and resize it to ``output_size``."""
+        x, y, w, h = self._get_params(img)
+        cropped_img = img[y:y + h, x:x + w]
+        return F.resize(cropped_img, self.output_size, self.interpolation)
+
+
+class CenterCropResize(object):
+    """Take a padded square crop from the image center, then resize it.
+
+    The crop side is ``int(s / (s + crop_padding) * min(h, w))`` where ``s``
+    is the smaller entry of ``size`` and ``h, w`` are the image height/width.
+
+    Args:
+        size (int|list|tuple): Target size of output image, with (height, width) shape.
+        crop_padding (int): center crop with the padding. Default: 32.
+        interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
+    """
+
+    def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR):
+        # A bare int means a square output.
+        self.size = (size, size) if isinstance(size, int) else size
+        self.crop_padding = crop_padding
+        self.interpolation = interpolation
+
+    def _get_params(self, img):
+        # Returns (crop side, row offset, column offset).
+        height, width = img.shape[:2]
+        target = min(self.size)
+        shorter_edge = min((height, width))
+        c = int(target / (target + self.crop_padding) * shorter_edge)
+        row_start = (height + 1 - c) // 2
+        col_start = (width + 1 - c) // 2
+        return c, row_start, col_start
+
+    def __call__(self, img):
+        c, top, left = self._get_params(img)
+        window = img[top:top + c, left:left + c, :]
+        return F.resize(window, self.size, self.interpolation)
+
+
+class CenterCrop(object):
+    """Cut a fixed-size window out of the middle of the input data.
+
+    Args:
+        output_size: Target size of output image, with (height, width) shape.
+            A single int ``n`` is treated as ``(n, n)``.
+    """
+
+    def __init__(self, output_size):
+        square = isinstance(output_size, int)
+        self.output_size = (output_size, output_size) if square else output_size
+
+    def _get_params(self, img):
+        # Returns (x, y): the left column and top row of the crop window.
+        crop_h, crop_w = self.output_size
+        img_h, img_w, _ = img.shape
+        assert crop_h <= img_h and crop_w <= img_w, "output size is bigger than image size"
+        left = int(round((img_w - crop_w) / 2.0))
+        top = int(round((img_h - crop_h) / 2.0))
+        return left, top
+
+    def __call__(self, img):
+        left, top = self._get_params(img)
+        crop_h, crop_w = self.output_size
+        return img[top:top + crop_h, left:left + crop_w]
+
+
+class RandomHorizontalFlip(object):
+    """Randomly apply a horizontal flip to the input data.
+
+    Args:
+        prob (float): probability of the input data being flipped. Default: 0.5
+    """
+
+    def __init__(self, prob=0.5):
+        self.prob = prob
+
+    def __call__(self, img):
+        # One uniform draw per call decides whether the flip happens.
+        do_flip = np.random.random() < self.prob
+        return F.flip(img, code=1) if do_flip else img
+
+
+class RandomVerticalFlip(object):
+    """Randomly apply a vertical flip to the input data.
+
+    Args:
+        prob (float): probability of the input data being flipped. Default: 0.5
+    """
+
+    def __init__(self, prob=0.5):
+        self.prob = prob
+
+    def __call__(self, img):
+        # One uniform draw per call decides whether the flip happens.
+        do_flip = np.random.random() < self.prob
+        return F.flip(img, code=0) if do_flip else img
+
+
+class Normalize(object):
+    """Normalize the input data with mean and standard deviation.
+    Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
+    this transform will normalize each channel of the input data.
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    ``mean`` and ``std`` are stored with shape ``(n, 1, 1)``, so they
+    broadcast against channel-first (C, H, W) input.
+
+    Args:
+        mean (int|float|list): Sequence of means for each channel. A single
+            number is repeated for three channels.
+        std (int|float|list): Sequence of standard deviations for each channel.
+            A single number is repeated for three channels.
+
+    """
+
+    def __init__(self, mean=0.0, std=1.0):
+        if isinstance(mean, numbers.Number):
+            mean = [mean, mean, mean]
+
+        if isinstance(std, numbers.Number):
+            # Must rebind ``std`` (not ``mean``): otherwise a scalar std
+            # clobbers the mean and ``len(std)`` below fails on a number.
+            std = [std, std, std]
+
+        self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
+        self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
+
+    def __call__(self, img):
+        """Return ``(img - mean) / std`` using numpy broadcasting."""
+        return (img - self.mean) / self.std
+
+
+class Permute(object):
+    """Rearrange the axes of the input data into a target layout.
+    Most transforms work on HWC images, while a neural network may expect
+    a CHW tensor. The input must be an HWC numpy.ndarray.
+
+    Note that, besides moving the channel axis to the front, the channel
+    order itself is reversed (e.g. a B,G,R image comes out as R,G,B).
+
+    Args:
+        mode: Output mode of input. Use "CHW" mode by default.
+    """
+
+    def __init__(self, mode="CHW"):
+        supported_modes = ("CHW", )
+        assert mode in supported_modes, \
+            "Only support 'CHW' mode, but received mode: {}".format(mode)
+        self.mode = mode
+
+    def __call__(self, img):
+        if self.mode != "CHW":
+            return img
+        # HWC -> CHW, then flip the order of the channels.
+        channels_first = img.transpose((2, 0, 1))
+        return channels_first[::-1, ...]
+
+
+class GaussianNoise(object):
+    """Perturb the input data with random gaussian noise.
+    The noise is drawn with the given mean and std, multiplied by 255,
+    added to the input, and the result is clipped to [0, 255].
+
+    Args:
+        mean: Gaussian mean used to generate noise.
+        std: Gaussian standard deviation used to generate noise.
+    """
+
+    def __init__(self, mean=0.0, std=1.0):
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+
+    def __call__(self, img):
+        original_dtype = img.dtype
+        # One sample per element, scaled up to the 0..255 pixel range.
+        raw_noise = np.random.normal(self.mean, self.std, img.shape)
+        scaled_noise = (raw_noise * 255).astype(np.float32)
+        noisy = img + scaled_noise
+        # Clamp to the valid pixel range and restore the input dtype.
+        return np.clip(noisy, 0, 255).astype(original_dtype)
+
+
+class BrightnessTransform(object):
+    """Randomly scale the brightness of the image.
+
+    Every pixel is multiplied by a factor drawn uniformly from
+    [max(0, 1 - value), 1 + value]; the result is clipped to [0, 255].
+
+    Args:
+        value: How much to adjust the brightness. Can be any
+            non negative number. 0 gives the original image
+    """
+
+    def __init__(self, value):
+        if value < 0:
+            raise ValueError("brightness value should be non-negative")
+        self.value = value
+
+    def __call__(self, img):
+        if self.value == 0:
+            return img
+
+        original_dtype = img.dtype
+        low, high = max(0, 1 - self.value), 1 + self.value
+        factor = np.random.uniform(low, high)
+        # Work in float32 so the scaling cannot overflow the input dtype.
+        scaled = img.astype(np.float32) * factor
+        return scaled.clip(0, 255).astype(original_dtype)
+
+
+class ContrastTransform(object):
+    """Randomly adjust the contrast of the image.
+
+    The image is blended with the mean of its grayscale version using a
+    factor drawn uniformly from [max(0, 1 - value), 1 + value]; the result
+    is clipped to [0, 255].
+
+    Args:
+        value: How much to adjust the contrast. Can be any
+            non negative number. 0 gives the original image
+    """
+
+    def __init__(self, value):
+        if value < 0:
+            raise ValueError("contrast value should be non-negative")
+        self.value = value
+
+    def __call__(self, img):
+        if self.value == 0:
+            return img
+
+        original_dtype = img.dtype
+        img = img.astype(np.float32)
+        low, high = max(0, 1 - self.value), 1 + self.value
+        alpha = np.random.uniform(low, high)
+        # Mean gray level of the image (converted with BGR channel order).
+        gray_mean = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean()
+        blended = img * alpha + gray_mean * (1 - alpha)
+        return blended.clip(0, 255).astype(original_dtype)
+
+
+class SaturationTransform(object):
+    """Randomly adjust the saturation of the image.
+
+    The image is blended pixel-wise with its grayscale version using a
+    factor drawn uniformly from [max(0, 1 - value), 1 + value]; the result
+    is clipped to [0, 255].
+
+    Args:
+        value: How much to adjust the saturation. Can be any
+            non negative number. 0 gives the original image
+    """
+
+    def __init__(self, value):
+        if value < 0:
+            raise ValueError("saturation value should be non-negative")
+        self.value = value
+
+    def __call__(self, img):
+        if self.value == 0:
+            return img
+
+        original_dtype = img.dtype
+        img = img.astype(np.float32)
+        low, high = max(0, 1 - self.value), 1 + self.value
+        alpha = np.random.uniform(low, high)
+        # Grayscale copy with a trailing axis so it broadcasts over channels.
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[..., np.newaxis]
+        blended = img * alpha + gray * (1 - alpha)
+        return blended.clip(0, 255).astype(original_dtype)
+
+
+class HueTransform(object):
+    """Adjust hue of the image.
+
+    The image is converted to HSV (full 0..255 hue range), the hue channel
+    is rotated by a random offset, and the image is converted back.
+
+    Args:
+        value: How much to adjust the hue. Can be any number
+            between 0 and 0.5, 0 gives the original image
+
+    Raises:
+        ValueError: if ``value`` is outside [0.0, 0.5].
+    """
+
+    def __init__(self, value):
+        if value < 0 or value > 0.5:
+            raise ValueError("hue value should be in [0.0, 0.5]")
+        self.value = value
+
+    def __call__(self, img):
+        """Return ``img`` with its hue shifted by a random amount.
+
+        The shift is ``alpha * 255`` hue steps with ``alpha`` drawn uniformly
+        from [-value, value]. The input is cast to uint8 for the color
+        conversion and the result is cast back to the input dtype.
+        """
+        if self.value == 0:
+            return img
+
+        dtype = img.dtype
+        img = img.astype(np.uint8)
+        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
+        h, s, v = cv2.split(hsv_img)
+
+        alpha = np.random.uniform(-self.value, self.value)
+        # Reduce the (possibly negative) offset modulo 256 with Python ints
+        # before building the uint8: converting a negative float straight to
+        # an unsigned type is platform-dependent.
+        shift = np.uint8(int(alpha * 255) % 256)
+        # uint8 addition take cares of rotation across boundaries
+        with np.errstate(over="ignore"):
+            h += shift
+        hsv_img = cv2.merge([h, s, v])
+        return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)
+
+
+class ColorJitter(object):
+    """Randomly change the brightness, contrast, saturation and hue of an image.
+
+    Each non-zero argument enables the matching transform
+    (``BrightnessTransform``, ``ContrastTransform``, ``SaturationTransform``,
+    ``HueTransform``). The enabled transforms are applied in a freshly
+    shuffled order on every call.
+
+    Args:
+        brightness: How much to jitter brightness.
+            Chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
+            Should be a non negative number.
+        contrast: How much to jitter contrast.
+            Chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
+            Should be a non negative number.
+        saturation: How much to jitter saturation.
+            Chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
+            Should be a non negative number.
+        hue: How much to jitter hue.
+            Chosen uniformly from [-hue, hue].
+            Should have 0 <= hue <= 0.5.
+    """
+
+    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
+        transforms = []
+        if brightness != 0:
+            transforms.append(BrightnessTransform(brightness))
+        if contrast != 0:
+            transforms.append(ContrastTransform(contrast))
+        if saturation != 0:
+            transforms.append(SaturationTransform(saturation))
+        if hue != 0:
+            transforms.append(HueTransform(hue))
+
+        # The application order is drawn per call in ``__call__``; shuffling
+        # here would freeze a single order for the lifetime of the object.
+        self.transforms = Compose(transforms)
+
+    def __call__(self, img):
+        """Apply the enabled color transforms to ``img`` in random order."""
+        # Shuffle a copy so the stored ``Compose`` is left untouched.
+        order = list(self.transforms.transforms)
+        random.shuffle(order)
+        for transform in order:
+            img = transform(img)
+        return img