diff --git a/README.cn.md b/README.cn.md index 9491690e3dbcca47eab68ded994d9e8ce3d1c507..3b2403bb513383e57e87a1e6de19943c13da5b3f 100644 --- a/README.cn.md +++ b/README.cn.md @@ -98,12 +98,13 @@ PaddlePaddle提供了丰富的运算单元,帮助大家以模块化的方式 图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。图像分类是根据图像的语义信息对不同类别图像进行区分,是计算机视觉中重要的基础问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础,在许多领域都有着广泛的应用。如:安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 -在图像分类任务中,我们向大家介绍如何训练AlexNet、VGG、GoogLeNet和ResNet模型。同时提供了一个够将Caffe训练好的模型文件转换为PaddlePaddle模型文件的模型转换工具。 +在图像分类任务中,我们向大家介绍如何训练AlexNet、VGG、GoogLeNet、ResNet和Inception-v4模型。同时提供了一个够将Caffe训练好的模型文件转换为PaddlePaddle模型文件的模型转换工具。 - 11.1 [将Caffe模型文件转换为PaddlePaddle模型文件](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle) - 11.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification) - 11.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification) - 11.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification) +- 11.5 [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/image_classification) ## 12. 目标检测 diff --git a/README.md b/README.md index 8b938a30dcee7e1373620065df80f7f85d461820..876c5621cc25f1867947d27b99123ae7156c270c 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,12 @@ As an example for sequence-to-sequence learning, we take the machine translation ## 9. Image classification -For the example of image classification, we show you how to train AlexNet, VGG, GoogLeNet and ResNet models in PaddlePaddle. It also provides a model conversion tool that converts Caffe trained model files into PaddlePaddle model files. +For the example of image classification, we show you how to train AlexNet, VGG, GoogLeNet, ResNet and Inception-v4 models in PaddlePaddle. It also provides a model conversion tool that converts Caffe trained model files into PaddlePaddle model files. - 9.1 [convert Caffe model file to PaddlePaddle model file](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle) - 9.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification) - 9.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification) - 9.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification) +- 9.5 [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/image_classification) This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE). diff --git a/image_classification/README.md b/image_classification/README.md index 843d683c004865489579f684cda15218e5c77379..b160d3efd51eb28c06a5bdfc00c88dd113c050ca 100644 --- a/image_classification/README.md +++ b/image_classification/README.md @@ -1,7 +1,7 @@ 图像分类 ======================= -这里将介绍如何在PaddlePaddle下使用AlexNet、VGG、GoogLeNet和ResNet模型进行图像分类。图像分类问题的描述和这四种模型的介绍可以参考[PaddlePaddle book](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)。 +这里将介绍如何在PaddlePaddle下使用AlexNet、VGG、GoogLeNet、ResNet和Inception-v4模型进行图像分类。图像分类问题的描述和这些模型的介绍可以参考[PaddlePaddle book](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)。 ## 训练模型 @@ -18,6 +18,7 @@ import vgg import resnet import alexnet import googlenet +import inception_v4 # PaddlePaddle init @@ -41,7 +42,7 @@ lbl = paddle.layer.data( ### 获得所用模型 -这里可以选择使用AlexNet、VGG、GoogLeNet和ResNet模型中的一个模型进行图像分类。通过调用相应的方法可以获得网络最后的Softmax层。 +这里可以选择使用AlexNet、VGG、GoogLeNet、ResNet和Inception-v4模型中的一个模型进行图像分类。通过调用相应的方法可以获得网络最后的Softmax层。 1. 使用AlexNet模型 @@ -86,6 +87,14 @@ ResNet模型可以通过下面的代码获取: out = resnet.resnet_imagenet(image, class_dim=CLASS_DIM) ``` +5. 使用Inception-v4模型 + +Inception-v4模型可以通过下面的代码获取: + +```python +out = inception_v4.inception_v4(image, class_dim=CLASS_DIM) +``` + ### 定义损失函数 ```python diff --git a/image_classification/inception_v4.py b/image_classification/inception_v4.py new file mode 100644 index 0000000000000000000000000000000000000000..9a8c5fa87bee9b5e41c85e334a18b6907bba4d86 --- /dev/null +++ b/image_classification/inception_v4.py @@ -0,0 +1,526 @@ +import paddle.v2 as paddle + +__all__ = ['inception_v4'] + + +def img_conv(name, + input, + num_filters, + filter_size, + stride, + padding, + num_channels=None): + conv = paddle.layer.img_conv( + name=name, + input=input, + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + act=paddle.activation.Linear()) + norm = paddle.layer.batch_norm( + name=name + '_norm', input=conv, act=paddle.activation.Relu()) + return norm + + +def stem(input): + conv0 = img_conv( + name='stem_conv0', + input=input, + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + padding=1) + conv1 = img_conv( + name='stem_conv1', + input=conv0, + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + padding=1) + conv2 = img_conv( + name='stem_conv2', + input=conv1, + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + padding=1) + + def block0(input): + pool0 = paddle.layer.img_pool( + name='stem_branch0_pool0', + input=input, + num_channels=64, + pool_size=3, + stride=2, + pool_type=paddle.pooling.Max()) + conv0 = img_conv( + name='stem_branch0_conv0', + input=input, + num_channels=64, + num_filters=96, + filter_size=3, + stride=2, + padding=1) + return paddle.layer.concat(input=[pool0, conv0]) + + def block1(input): + l_conv0 = img_conv( + name='stem_branch1_l_conv0', + input=input, + num_channels=160, + num_filters=64, + filter_size=1, + stride=1, + padding=0) + l_conv1 = img_conv( + name='stem_branch1_l_conv1', + input=l_conv0, + num_channels=64, + num_filters=96, + filter_size=3, + stride=1, + padding=1) + r_conv0 = img_conv( + name='stem_branch1_r_conv0', + input=input, + num_channels=160, + num_filters=64, + filter_size=1, + stride=1, + padding=0) + r_conv1 = img_conv( + name='stem_branch1_r_conv1', + input=r_conv0, + num_channels=64, + num_filters=64, + filter_size=(7, 1), + stride=1, + padding=(3, 0)) + r_conv2 = img_conv( + name='stem_branch1_r_conv2', + input=r_conv1, + num_channels=64, + num_filters=64, + filter_size=(1, 7), + stride=1, + padding=(0, 3)) + r_conv3 = img_conv( + name='stem_branch1_r_conv3', + input=r_conv2, + num_channels=64, + num_filters=96, + filter_size=3, + stride=1, + padding=1) + return paddle.layer.concat(input=[l_conv1, r_conv3]) + + def block2(input): + conv0 = img_conv( + name='stem_branch2_conv0', + input=input, + num_channels=192, + num_filters=192, + filter_size=3, + stride=2, + padding=1) + pool0 = paddle.layer.img_pool( + name='stem_branch2_pool0', + input=input, + num_channels=192, + pool_size=3, + stride=2, + pool_type=paddle.pooling.Max()) + return paddle.layer.concat(input=[conv0, pool0]) + + conv3 = block0(conv2) + conv4 = block1(conv3) + conv5 = block2(conv4) + return conv5 + + +def Inception_A(input, depth): + b0_pool0 = paddle.layer.img_pool( + name='inceptA{0}_branch0_pool0'.format(depth), + input=input, + num_channels=384, + pool_size=3, + stride=1, + padding=1, + pool_type=paddle.pooling.Avg()) + b0_conv0 = img_conv( + name='inceptA{0}_branch0_conv0'.format(depth), + input=b0_pool0, + num_channels=384, + num_filters=96, + filter_size=1, + stride=1, + padding=0) + b1_conv0 = img_conv( + name='inceptA{0}_branch1_conv0'.format(depth), + input=input, + num_channels=384, + num_filters=96, + filter_size=1, + stride=1, + padding=0) + b2_conv0 = img_conv( + name='inceptA{0}_branch2_conv0'.format(depth), + input=input, + num_channels=384, + num_filters=64, + filter_size=1, + stride=1, + padding=0) + b2_conv1 = img_conv( + name='inceptA{0}_branch2_conv1'.format(depth), + input=b2_conv0, + num_channels=64, + num_filters=96, + filter_size=3, + stride=1, + padding=1) + b3_conv0 = img_conv( + name='inceptA{0}_branch3_conv0'.format(depth), + input=input, + num_channels=384, + num_filters=64, + filter_size=1, + stride=1, + padding=0) + b3_conv1 = img_conv( + name='inceptA{0}_branch3_conv1'.format(depth), + input=b3_conv0, + num_channels=64, + num_filters=96, + filter_size=3, + stride=1, + padding=1) + b3_conv2 = img_conv( + name='inceptA{0}_branch3_conv2'.format(depth), + input=b3_conv1, + num_channels=96, + num_filters=96, + filter_size=3, + stride=1, + padding=1) + return paddle.layer.concat(input=[b0_conv0, b1_conv0, b2_conv1, b3_conv2]) + + +def Inception_B(input, depth): + b0_pool0 = paddle.layer.img_pool( + name='inceptB{0}_branch0_pool0'.format(depth), + input=input, + num_channels=1024, + pool_size=3, + stride=1, + padding=1, + pool_type=paddle.pooling.Avg()) + b0_conv0 = img_conv( + name='inceptB{0}_branch0_conv0'.format(depth), + input=b0_pool0, + num_channels=1024, + num_filters=128, + filter_size=1, + stride=1, + padding=0) + b1_conv0 = img_conv( + name='inceptB{0}_branch1_conv0'.format(depth), + input=input, + num_channels=1024, + num_filters=384, + filter_size=1, + stride=1, + padding=0) + b2_conv0 = img_conv( + name='inceptB{0}_branch2_conv0'.format(depth), + input=input, + num_channels=1024, + num_filters=192, + filter_size=1, + stride=1, + padding=0) + b2_conv1 = img_conv( + name='inceptB{0}_branch2_conv1'.format(depth), + input=b2_conv0, + num_channels=192, + num_filters=224, + filter_size=(1, 7), + stride=1, + padding=(0, 3)) + b2_conv2 = img_conv( + name='inceptB{0}_branch2_conv2'.format(depth), + input=b2_conv1, + num_channels=224, + num_filters=256, + filter_size=(7, 1), + stride=1, + padding=(3, 0)) + b3_conv0 = img_conv( + name='inceptB{0}_branch3_conv0'.format(depth), + input=input, + num_channels=1024, + num_filters=192, + filter_size=1, + stride=1, + padding=0) + b3_conv1 = img_conv( + name='inceptB{0}_branch3_conv1'.format(depth), + input=b3_conv0, + num_channels=192, + num_filters=192, + filter_size=(1, 7), + stride=1, + padding=(0, 3)) + b3_conv2 = img_conv( + name='inceptB{0}_branch3_conv2'.format(depth), + input=b3_conv1, + num_channels=192, + num_filters=224, + filter_size=(7, 1), + stride=1, + padding=(3, 0)) + b3_conv3 = img_conv( + name='inceptB{0}_branch3_conv3'.format(depth), + input=b3_conv2, + num_channels=224, + num_filters=224, + filter_size=(1, 7), + stride=1, + padding=(0, 3)) + b3_conv4 = img_conv( + name='inceptB{0}_branch3_conv4'.format(depth), + input=b3_conv3, + num_channels=224, + num_filters=256, + filter_size=(7, 1), + stride=1, + padding=(3, 0)) + return paddle.layer.concat(input=[b0_conv0, b1_conv0, b2_conv2, b3_conv4]) + + +def Inception_C(input, depth): + b0_pool0 = paddle.layer.img_pool( + name='inceptC{0}_branch0_pool0'.format(depth), + input=input, + num_channels=1536, + pool_size=3, + stride=1, + padding=1, + pool_type=paddle.pooling.Avg()) + b0_conv0 = img_conv( + name='inceptC{0}_branch0_conv0'.format(depth), + input=b0_pool0, + num_channels=1536, + num_filters=256, + filter_size=1, + stride=1, + padding=0) + b1_conv0 = img_conv( + name='inceptC{0}_branch1_conv0'.format(depth), + input=input, + num_channels=1536, + num_filters=256, + filter_size=1, + stride=1, + padding=0) + b2_conv0 = img_conv( + name='inceptC{0}_branch2_conv0'.format(depth), + input=input, + num_channels=1536, + num_filters=384, + filter_size=1, + stride=1, + padding=0) + b2_conv1 = img_conv( + name='inceptC{0}_branch2_conv1'.format(depth), + input=b2_conv0, + num_channels=384, + num_filters=256, + filter_size=(1, 3), + stride=1, + padding=(0, 1)) + b2_conv2 = img_conv( + name='inceptC{0}_branch2_conv2'.format(depth), + input=b2_conv0, + num_channels=384, + num_filters=256, + filter_size=(3, 1), + stride=1, + padding=(1, 0)) + b3_conv0 = img_conv( + name='inceptC{0}_branch3_conv0'.format(depth), + input=input, + num_channels=1536, + num_filters=384, + filter_size=1, + stride=1, + padding=0) + b3_conv1 = img_conv( + name='inceptC{0}_branch3_conv1'.format(depth), + input=b3_conv0, + num_channels=384, + num_filters=448, + filter_size=(1, 3), + stride=1, + padding=(0, 1)) + b3_conv2 = img_conv( + name='inceptC{0}_branch3_conv2'.format(depth), + input=b3_conv1, + num_channels=448, + num_filters=512, + filter_size=(3, 1), + stride=1, + padding=(1, 0)) + b3_conv3 = img_conv( + name='inceptC{0}_branch3_conv3'.format(depth), + input=b3_conv2, + num_channels=512, + num_filters=256, + filter_size=(3, 1), + stride=1, + padding=(1, 0)) + b3_conv4 = img_conv( + name='inceptC{0}_branch3_conv4'.format(depth), + input=b3_conv2, + num_channels=512, + num_filters=256, + filter_size=(1, 3), + stride=1, + padding=(0, 1)) + return paddle.layer.concat( + input=[b0_conv0, b1_conv0, b2_conv1, b2_conv2, b3_conv3, b3_conv4]) + + +def Reduction_A(input): + b0_pool0 = paddle.layer.img_pool( + name='ReductA_branch0_pool0', + input=input, + num_channels=384, + pool_size=3, + stride=2, + pool_type=paddle.pooling.Max()) + b1_conv0 = img_conv( + name='ReductA_branch1_conv0', + input=input, + num_channels=384, + num_filters=384, + filter_size=3, + stride=2, + padding=1) + b2_conv0 = img_conv( + name='ReductA_branch2_conv0', + input=input, + num_channels=384, + num_filters=192, + filter_size=1, + stride=1, + padding=0) + b2_conv1 = img_conv( + name='ReductA_branch2_conv1', + input=b2_conv0, + num_channels=192, + num_filters=224, + filter_size=3, + stride=1, + padding=1) + b2_conv2 = img_conv( + name='ReductA_branch2_conv2', + input=b2_conv1, + num_channels=224, + num_filters=256, + filter_size=3, + stride=2, + padding=1) + return paddle.layer.concat(input=[b0_pool0, b1_conv0, b2_conv2]) + + +def Reduction_B(input): + b0_pool0 = paddle.layer.img_pool( + name='ReductB_branch0_pool0', + input=input, + num_channels=1024, + pool_size=3, + stride=2, + pool_type=paddle.pooling.Max()) + b1_conv0 = img_conv( + name='ReductB_branch1_conv0', + input=input, + num_channels=1024, + num_filters=192, + filter_size=1, + stride=1, + padding=0) + b1_conv1 = img_conv( + name='ReductB_branch1_conv1', + input=b1_conv0, + num_channels=192, + num_filters=192, + filter_size=3, + stride=2, + padding=1) + b2_conv0 = img_conv( + name='ReductB_branch2_conv0', + input=input, + num_channels=1024, + num_filters=256, + filter_size=1, + stride=1, + padding=0) + b2_conv1 = img_conv( + name='ReductB_branch2_conv1', + input=b2_conv0, + num_channels=256, + num_filters=256, + filter_size=(1, 7), + stride=1, + padding=(0, 3)) + b2_conv2 = img_conv( + name='ReductB_branch2_conv2', + input=b2_conv1, + num_channels=256, + num_filters=320, + filter_size=(7, 1), + stride=1, + padding=(3, 0)) + b2_conv3 = img_conv( + name='ReductB_branch2_conv3', + input=b2_conv2, + num_channels=320, + num_filters=320, + filter_size=3, + stride=2, + padding=1) + return paddle.layer.concat(input=[b0_pool0, b1_conv1, b2_conv3]) + + +def inception_v4(input, class_dim): + conv = stem(input) + + for i in range(4): + conv = Inception_A(conv, i) + conv = Reduction_A(conv) + for i in range(7): + conv = Inception_B(conv, i) + conv = Reduction_B(conv) + for i in range(3): + conv = Inception_C(conv, i) + + pool = paddle.layer.img_pool( + name='incept_avg_pool', + input=conv, + num_channels=1536, + pool_size=7, + stride=1, + pool_type=paddle.pooling.Avg()) + drop = paddle.layer.dropout(input=pool, dropout_rate=0.2) + out = paddle.layer.fc( + name='incept_fc', + input=drop, + size=class_dim, + act=paddle.activation.Softmax()) + return out diff --git a/image_classification/infer.py b/image_classification/infer.py index 659c4f2a8e997c9060934cb81c783dd900854d5e..e7a0b98ef04ec2f3a00f640baaa88c51773fafac 100644 --- a/image_classification/infer.py +++ b/image_classification/infer.py @@ -5,6 +5,7 @@ import vgg import resnet import alexnet import googlenet +import inception_v4 import argparse import os from PIL import Image @@ -26,7 +27,10 @@ def main(): parser.add_argument( 'model', help='The model for image classification', - choices=['alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet']) + choices=[ + 'alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet', + 'inception_v4' + ]) parser.add_argument( 'params_path', help='The file which stores the parameters') args = parser.parse_args() @@ -49,6 +53,8 @@ def main(): out = resnet.resnet_imagenet(image, class_dim=CLASS_DIM) elif args.model == 'googlenet': out, _, _ = googlenet.googlenet(image, class_dim=CLASS_DIM) + elif args.model == 'inception_v4': + out = inception_v4.inception_v4(image, class_dim=CLASS_DIM) # load parameters with gzip.open(args.params_path, 'r') as f: diff --git a/image_classification/train.py b/image_classification/train.py index 12a582db3a28f8e136a87d29ca09be4b8b4782cb..8f343f97fc6fa004edf4cc45c6d48996d8249011 100644 --- a/image_classification/train.py +++ b/image_classification/train.py @@ -6,6 +6,7 @@ import vgg import resnet import alexnet import googlenet +import inception_v4 import argparse DATA_DIM = 3 * 224 * 224 @@ -19,7 +20,10 @@ def main(): parser.add_argument( 'model', help='The model for image classification', - choices=['alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet']) + choices=[ + 'alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet', + 'inception_v4' + ]) args = parser.parse_args() # PaddlePaddle init @@ -52,6 +56,8 @@ def main(): input=out2, label=lbl, coeff=0.3) paddle.evaluator.classification_error(input=out2, label=lbl) extra_layers = [loss1, loss2] + elif args.model == 'inception_v4': + out = inception_v4.inception_v4(image, class_dim=CLASS_DIM) cost = paddle.layer.classification_cost(input=out, label=lbl)