Commit 65b20788 authored by dangqingqing

finish README.md and update code

Parent 3088707a
This diff is collapsed.
classify.py:

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,44 +12,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os, sys
import cPickle
import numpy as np
from PIL import Image
from optparse import OptionParser

import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config

import logging
logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
def vis_square(data, fname):
    """Take an array of shape (n, height, width) or (n, height, width, 3)
       and visualize each (height, width) slice in a grid of size
       approx. sqrt(n) by sqrt(n)."""
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    # normalize data for display
    data = (data - data.min()) / (data.max() - data.min())
    # force the number of filters to be square
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n**2 - data.shape[0]), (0, 1), (0, 1))  # add some space between filters
               + ((0, 0), ) * (data.ndim - 3))  # don't pad the last dimension (if there is one)
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones (white)
    # tile the filters into an image
    data = data.reshape((n, n) + data.shape[1:]).transpose(
        (0, 2, 1, 3) + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    plt.imshow(data, cmap='gray')
    plt.axis('off')
    plt.savefig(fname)
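# A quick sanity check (illustrative names, not part of this commit):
#   vis_square(np.random.rand(64, 32, 32), 'grid_demo.png')
# tiles 64 random feature maps into a roughly 8 x 8 grid image.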
class ImageClassifier():
    def __init__(self,
                 train_conf,
                 resize_dim,
                 crop_dim,
                 model_dir=None,
                 use_gpu=True,
                 mean_file=None,
                 oversample=False,
                 is_color=True):
        """
        train_conf: network configure.
        model_dir: string, directory of model.
        resize_dim: int, resized image size.
        crop_dim: int, crop size.
        mean_file: string, image mean file.
        oversample: bool, oversample means multiple crops, namely five
                    patches (the four corner patches and the center
                    patch) as well as their horizontal reflections,
                    ten crops in all.
        """
        self.train_conf = train_conf
        self.model_dir = model_dir
        if model_dir is None:
@@ -60,47 +70,56 @@ class ImageClassifier():
        self.oversample = oversample
        self.is_color = is_color

        self.transformer = image_util.ImageTransformer(is_color=is_color)
        self.transformer.set_transpose((2, 0, 1))
        self.transformer.set_channel_swap((2, 1, 0))

        self.mean_file = mean_file
        if self.mean_file is not None:
            mean = np.load(self.mean_file)['mean']
            mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
            self.transformer.set_mean(mean)  # mean pixel
        else:
            # If no mean file is given, fall back to per-channel mean
            # values; these three values are calculated from ImageNet.
            self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))

        conf_args = "use_gpu=%d,is_predict=1" % (int(use_gpu))
        conf = parse_config(train_conf, conf_args)
        swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
        self.network = swig_paddle.GradientMachine.createFromConfigProto(
            conf.model_config)
        assert isinstance(self.network, swig_paddle.GradientMachine)
        self.network.loadParameters(self.model_dir)

        dim = 3 * self.crop_dims[0] * self.crop_dims[1]
        slots = [dense_vector(dim)]
        self.converter = DataProviderConverter(slots)
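        # (DataProviderConverter turns the nested list built by get_data
        # into Paddle input Arguments; with oversampling, each of the ten
        # crops presumably becomes one row of the dense input batch.)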
    def get_data(self, img_path):
        """
        1. load image from img_path.
        2. resize or oversampling.
        3. transformer data: transpose, channel swap, sub mean.
        return K x H x W ndarray.

        img_path: image path.
        """
        image = image_util.load_image(img_path, self.is_color)
        # An alternative way to get oversampled features is to crop and
        # average over a large feature map computed from a larger input
        # image; that approach reduces the computation.
        if self.oversample:
            # image_util.resize_image: short side is self.resize_dim
            image = image_util.resize_image(image, self.resize_dim)
            image = np.array(image)
            input = np.zeros((1, image.shape[0], image.shape[1], 3),
                             dtype=np.float32)
            input[0] = image.astype(np.float32)
            input = image_util.oversample(input, self.crop_dims)
        else:
            image = image.resize(self.crop_dims, Image.ANTIALIAS)
            input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3),
                             dtype=np.float32)
            input[0] = np.array(image).astype(np.float32)

        data_in = []
@@ -114,46 +133,91 @@ class ImageClassifier():
        return self.network.forwardTest(in_arg)

    def forward(self, data, output_layer):
        """
        input_data: py_paddle input data.
        output_layer: specify the name of probability, namely the layer with
                      softmax activation.
        return: the predicting probability of each label.
        """
        input = self.converter(data)
        self.network.forwardTest(input)
        output = self.network.getLayerOutputs(output_layer)
        res = {}
        if isinstance(output_layer, basestring):
            output_layer = [output_layer]
        for name in output_layer:
            # For oversampling, average predictions across crops.
            # If not, the shape of output[name] is (1, class_number);
            # the mean is still applicable.
            res[name] = output[name].mean(0)
        return res
def option_parser():
    usage = "%prog -c config -i data_list -w model_dir [options]"
    parser = OptionParser(usage="usage: %s" % usage)
    parser.add_option("--job",
                      action="store",
                      dest="job_type",
                      choices=['predict', 'extract'],
                      default='predict',
                      help="The job type. \
                            predict: predicting, \
                            extract: extract features")
    parser.add_option("--conf",
                      action="store",
                      dest="train_conf",
                      default='models/vgg.py',
                      help="network config")
    parser.add_option("--data",
                      action="store",
                      dest="data_file",
                      default='image/dog.png',
                      help="image list")
    parser.add_option("--model",
                      action="store",
                      dest="model_path",
                      default=None,
                      help="model path")
    parser.add_option("-c",
                      dest="cpu_gpu",
                      action="store_false",
                      help="Use cpu mode.")
    parser.add_option("-g",
                      dest="cpu_gpu",
                      default=True,
                      action="store_true",
                      help="Use gpu mode.")
    parser.add_option("--mean",
                      action="store",
                      dest="mean",
                      default='data/mean.meta',
                      help="The mean file.")
    parser.add_option("--multi_crop",
                      action="store_true",
                      dest="multi_crop",
                      default=False,
                      help="Whether to use multiple crops on image.")
    return parser.parse_args()
def main():
    options, args = option_parser()
    mean = 'data/mean.meta' if not options.mean else options.mean
    conf = 'models/vgg.py' if not options.train_conf else options.train_conf
    obj = ImageClassifier(conf,
                          32, 32,
                          options.model_path,
                          use_gpu=options.cpu_gpu,
                          mean_file=mean,
                          oversample=options.multi_crop)
    image_path = options.data_file
    if options.job_type == 'predict':
        output_layer = '__fc_layer_2__'
        data = obj.get_data(image_path)
        prob = obj.forward(data, output_layer)
        lab = np.argsort(-prob[output_layer])
        logging.info("Label of %s is: %d", image_path, lab[0])
    elif options.job_type == "extract":
        output_layer = '__conv_0__'
        data = obj.get_data(options.data_file)
        features = obj.forward(data, output_layer)
        dshape = (64, 32, 32)
        fea = features[output_layer].reshape(dshape)
        vis_square(fea, 'fea_conv0.png')
if __name__ == '__main__':
    main()
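For reference, the same flow can be driven from a Python shell. A minimal
sketch, assuming the file above is saved as classify.py (as the shell
scripts below suggest) and a model has been trained into output/pass-00299:

    from classify import ImageClassifier

    clf = ImageClassifier('models/vgg.py', 32, 32, 'output/pass-00299',
                          use_gpu=False, mean_file='data/mean.meta')
    probs = clf.forward(clf.get_data('image/dog.png'), '__fc_layer_2__')
    print probs['__fc_layer_2__'].argmax()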
dataprovider.py:

@@ -14,7 +14,6 @@
import numpy as np
import cPickle
from paddle.trainer.PyDataProvider2 import *

def initializer(settings, mean_path, is_train, **kwargs):
    ...
extract.sh:

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Append -c to run in CPU mode.
python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png
image_classification/image/resnet.png: binary image changed (343.4 KB → 346.4 KB).
models/resnet.py:

@@ -14,13 +14,33 @@
from paddle.trainer_config_helpers import *

is_predict = get_config_arg("is_predict", bool, False)

if not is_predict:
    args = {'meta': 'data/mean.meta'}
    define_py_data_sources2(
        train_list='data/train.list',
        test_list='data/test.list',
        module='dataprovider',
        obj='process',
        args=args)

settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_rate_decay_a=0.1,
    learning_rate_decay_b=50000 * 100,
    learning_rate_schedule='discexp',
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0001 * 128))
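# An assumption based on Paddle v1's documented 'discexp' (discrete
# exponential) schedule:
#   lr = learning_rate * decay_a ** floor(num_samples_processed / decay_b)
# With decay_b = 50000 * 100, the rate drops by 10x roughly every 100
# passes over the 50000-image CIFAR-10 training set.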
def conv_bn_layer(input,
                  ch_out,
                  filter_size,
                  stride,
                  padding,
                  active_type=ReluActivation(),
                  ch_in=None):
    tmp = img_conv_layer(
        input=input,
        filter_size=filter_size,
@@ -35,16 +55,16 @@ def conv_bn_layer(input,

def shortcut(ipt, n_in, n_out, stride):
    if n_in != n_out:
        return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation())
    else:
        return ipt

def basicblock(ipt, ch_out, stride):
    ch_in = ipt.num_filters
    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation())
    short = shortcut(ipt, ch_in, ch_out, stride)
    return addto_layer(input=[ipt, short], act=ReluActivation())
def bottleneck(ipt, ch_out, stride):
    ch_in = ipt.num_filters
@@ -52,13 +72,13 @@ def bottleneck(ipt, ch_out, stride):
    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1)
    tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation())
    short = shortcut(ipt, ch_in, ch_out, stride)
    return addto_layer(input=[ipt, short], act=ReluActivation())

def layer_warp(block_func, ipt, features, count, stride):
    tmp = block_func(ipt, features, stride)
    for i in range(1, count):
        tmp = block_func(tmp, features, 1)
    return tmp
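# For example, layer_warp(basicblock, tmp, 16, 9, 1) stacks nine 16-filter
# residual blocks; only the first block of a stage gets the given stride,
# so each stage downsamples at most once.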
def resnet_imagenet(ipt, depth=50):
    cfg = {18: ([2, 2, 2, 2], basicblock),
@@ -96,42 +116,23 @@ def resnet_cifar10(ipt, depth=56):
        filter_size=3,
        stride=1,
        padding=1)
    tmp = layer_warp(basicblock, tmp, 16, n, 1)
    tmp = layer_warp(basicblock, tmp, 32, n, 2)
    tmp = layer_warp(basicblock, tmp, 64, n, 2)
    tmp = img_pool_layer(input=tmp,
                         pool_size=8,
                         stride=1,
                         pool_type=AvgPooling())
    return tmp
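# resnet_cifar10 follows the depth = 6n + 2 rule of the original ResNet
# paper; presumably the elided body sets n = (depth - 2) / 6, so depth=56
# gives n = 9 basic blocks per stage.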
datadim = 3 * 32 * 32
classdim = 10
data = data_layer(name='image', size=datadim)
net = resnet_cifar10(data, depth=56)
out = fc_layer(input=net, size=classdim, act=SoftmaxActivation())

if not is_predict:
    lbl = data_layer(name="label", size=classdim)
    outputs(classification_cost(input=out, label=lbl))
else:
    outputs(out)
models/vgg.py:

@@ -14,11 +14,30 @@
from paddle.trainer_config_helpers import *

is_predict = get_config_arg("is_predict", bool, False)

if not is_predict:
    define_py_data_sources2(
        train_list='data/train.list',
        test_list='data/test.list',
        module='dataprovider',
        obj='process',
        args={'mean_path': 'data/mean.meta'})

settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_rate_decay_a=0.1,
    learning_rate_decay_b=50000 * 100,
    learning_rate_schedule='discexp',
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * 128))
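# The 0.1 / 128.0 learning rate and 0.0005 * 128 weight decay look like
# per-sample values paired with batch_size=128, i.e. effectively lr=0.1 and
# L2 decay 0.0005 per batch (an assumption about Paddle v1 semantics).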
def vgg_bn_drop(input):
    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
        return img_conv_group(
            input=ipt,
            num_channels=num_channels,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
@@ -28,7 +47,7 @@ def vgg_bn_drop(input):
            conv_batchnorm_drop_rate=dropouts,
            pool_type=MaxPooling())

    tmp = conv_block(input, 64, 2, [0.3, 0], 3)
    tmp = conv_block(tmp, 128, 2, [0.4, 0])
    tmp = conv_block(tmp, 256, 3, [0.4, 0.4, 0])
    tmp = conv_block(tmp, 512, 3, [0.4, 0.4, 0])
@@ -46,33 +65,16 @@ def vgg_bn_drop(input):
        input=tmp,
        size=512,
        act=LinearActivation())
    return tmp
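# Assuming the elided hunk holds the fifth conv_block (512 filters, 3
# convolutions), the groups 2+2+3+3+3 reproduce the 13-convolution stack
# of the VGG-16 design, here followed by fully-connected layers.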
datadim = 3 * 32 * 32
classdim = 10
data = data_layer(name='image', size=datadim)
net = vgg_bn_drop(data)
out = fc_layer(input=net, size=classdim, act=SoftmaxActivation())

if not is_predict:
    lbl = data_layer(name="label", size=classdim)
    cost = classification_cost(input=out, label=lbl)
    outputs(cost)
else:
    outputs(out)
predict.sh:

@@ -14,7 +14,4 @@
# limitations under the License.
set -e

# Append -c to run in CPU mode.
python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png
train.sh:

@@ -14,9 +14,9 @@
# limitations under the License.
set -e

config=models/resnet.py
#config=models/vgg.py
output=output
log=train.log

paddle train \
@@ -26,4 +26,4 @@ paddle train \
--log_period=100 \
--num_passes=300 \
--save_dir=$output
#2>&1 | tee $log