diff --git a/cyclegan/README.md b/cyclegan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef35c3ab1b3ec53ca2f9b7d6ba28f210b6d36e91 --- /dev/null +++ b/cyclegan/README.md @@ -0,0 +1,139 @@ +# Cycle GAN +--- +## 内容 + +- [安装](#安装) +- [简介](#简介) +- [代码结构](#代码结构) +- [数据准备](#数据准备) +- [模型训练与预测](#模型训练与预测) + +## 安装 + +运行本目录下的程序示例需要使用PaddlePaddle develop最新版本。如果您的PaddlePaddle安装版本低于此要求,请按照[安装文档](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html)中的说明更新PaddlePaddle安装版本。 + +## 简介 +Cycle GAN 是一种image to image 的图像生成网络,实现了非对称图像数据集的生成和风格迁移。模型结构如下图所示,我们的模型包含两个生成网络 G: X → Y 和 F: Y → X,以及相关的判别器 DY 和 DX 。通过训练DY,使G将X图尽量转换为Y图,反之亦然。同时引入两个“周期一致性损失”,它们保证:如果我们从一个领域转换到另一个领域,它还可以被转换回去:(b)正向循环一致性损失:x→G(x)→F(G(x))≈x, (c)反向循环一致性损失:y→F(y)→G(F(y))≈y + +
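+
+补充说明(根据本目录 `cyclegan.py` 中 `GLoss`、`DLoss` 的实现整理,仅作示意):生成器的总损失由最小二乘对抗损失、循环一致性 L1 损失和 identity L1 损失组成,判别器使用最小二乘损失;其中 λ_A、λ_B 默认为 10,λ_identity 默认为 0.5:
+
+```
+L_G = MSE(D_Y(G(x)), 1) + MSE(D_X(F(y)), 1)
+    + λ_A * ||F(G(x)) - x||_1 + λ_B * ||G(F(y)) - y||_1
+    + λ_identity * (λ_B * ||G(y) - y||_1 + λ_A * ||F(x) - x||_1)
+
+L_D = mean((D(real) - 1)^2 + D(fake)^2) / 2
+```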

+![图1.网络结构](image/net.png)
+ + +## 代码结构 +``` +├── data.py # 读取、处理数据。 +├── layers.py # 封装定义基础的layers。 +├── cyclegan.py # 定义基础生成网络和判别网络。 +├── train.py # 训练脚本。 +└── infer.py # 预测脚本。 +``` + + +## 数据准备 + +CycleGAN 支持的数据集可以参考download.py中的`cycle_pix_dataset`,可以通过指定`python download.py --dataset xxx` 下载得到。 + +由于版权问题,cityscapes 数据集无法通过脚本直接获得,需要从[官方](https://www.cityscapes-dataset.com/)下载数据, +下载完之后执行`python prepare_cityscapes_dataset.py --gtFine_dir ./gtFine/ --leftImg8bit_dir ./leftImg8bit --output_dir ./data/cityscapes/`处理, +将数据存放在`data/cityscapes`。 + +数据下载处理完毕后,需要您将数据组织为以下路径结构: +``` +data +|-- cityscapes +| |-- testA +| |-- testB +| |-- trainA +| |-- trainB + +``` + +然后运行txt生成脚本:`python generate_txt.py`,最终数据组织如下所示: +``` +data +|-- cityscapes +| |-- testA +| |-- testA.txt +| |-- testB +| |-- testB.txt +| |-- trainA +| |-- trainA.txt +| |-- trainB +| `-- trainB.txt + +``` + +以上数据文件中,`data`文件夹需要放在训练脚本`train.py`同级目录下。`testA`为存放真实街景图片的文件夹,`testB`为存放语义分割图片的文件夹,`testA.txt`和`testB.txt`分别为测试图片路径列表文件,格式如下: + +``` +data/cityscapes/testA/234_A.jpg +data/cityscapes/testA/292_A.jpg +data/cityscapes/testA/412_A.jpg +``` + +训练数据组织方式与测试数据相同。 + + +## 模型训练与预测 + +### 训练 + +在GPU单卡上训练: + +``` +env CUDA_VISIBLE_DEVICES=0 python train.py +``` + +执行`python train.py --help`可查看更多使用方式和参数详细说明。 + +图1为训练152轮的训练损失示意图,其中横坐标轴为训练轮数,纵轴为在训练集上的损失。其中,'g_loss','da_loss'和'db_loss'分别为生成器、判别器A和判别器B的训练损失。 + + +### 测试 + +执行以下命令可以选择已保存的训练权重,对测试集进行测试,通过 `--epoch` 制定权重轮次: + +``` +env CUDA_VISIBLE_DEVICES=0 python test.py --init_model=checkpoint/199 +``` +生成结果在 `output/eval`中 + + +### 预测 + +执行以下命令读取单张或多张图片进行预测: + +真实街景生成分割图像: + +``` +env CUDA_VISIBLE_DEVICES=0 python infer.py \ + --init_model="./checkpoints/199" --input="./image/testA/123_A.jpg" \ + --input_style=A +``` + +分割图像生成真实街景: + +``` +env CUDA_VISIBLE_DEVICES=0 python infer.py \ + --init_model="checkpoints/199" --input="./image/testB/78_B.jpg" \ + --input_style=B +``` +生成结果在 `output/single`中 + +训练180轮的模型预测效果如fakeA和fakeB所示: + + +

+![A2B](image/A2B.png)
+
+![B2A](image/B2A.png)

+ +>在本文示例中,均可通过修改`CUDA_VISIBLE_DEVICES`改变使用的显卡号。 diff --git a/cyclegan/__init__.py b/cyclegan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/cyclegan/check.py b/cyclegan/check.py new file mode 100644 index 0000000000000000000000000000000000000000..79ab4862d3c2082c36039b047be08d4a4b5dcedd --- /dev/null +++ b/cyclegan/check.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle.fluid as fluid + +__all__ = ['check_gpu', 'check_version'] + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not fluid.is_compiled_with_cuda(): + print(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.7.0') + except Exception as e: + print(err) + sys.exit(1) diff --git a/cyclegan/cyclegan.py b/cyclegan/cyclegan.py new file mode 100644 index 0000000000000000000000000000000000000000..6fdd21c1bdf41a8ed3b6743297b99ef239bd5543 --- /dev/null +++ b/cyclegan/cyclegan.py @@ -0,0 +1,232 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from layers import ConvBN, DeConvBN +import paddle.fluid as fluid +from model import Model, Loss + + +class ResnetBlock(fluid.dygraph.Layer): + def __init__(self, dim, dropout=False): + super(ResnetBlock, self).__init__() + self.dropout = dropout + self.conv0 = ConvBN(dim, dim, 3, 1) + self.conv1 = ConvBN(dim, dim, 3, 1, act=None) + + def forward(self, inputs): + out_res = fluid.layers.pad2d(inputs, [1, 1, 1, 1], mode="reflect") + out_res = self.conv0(out_res) + if self.dropout: + out_res = fluid.layers.dropout(out_res, dropout_prob=0.5) + out_res = fluid.layers.pad2d(out_res, [1, 1, 1, 1], mode="reflect") + out_res = self.conv1(out_res) + return out_res + inputs + + +class ResnetGenerator(fluid.dygraph.Layer): + def __init__(self, input_channel, n_blocks=9, dropout=False): + super(ResnetGenerator, self).__init__() + + self.conv0 = ConvBN(input_channel, 32, 7, 1) + self.conv1 = ConvBN(32, 64, 3, 2, padding=1) + self.conv2 = ConvBN(64, 128, 3, 2, padding=1) + + dim = 128 + self.resnet_blocks = [] + for i in range(n_blocks): + block = self.add_sublayer("generator_%d" % (i + 1), + ResnetBlock(dim, dropout)) + self.resnet_blocks.append(block) + + self.deconv0 = DeConvBN( + dim, 32 * 2, 3, 2, padding=[1, 1], outpadding=[0, 1, 0, 1]) + self.deconv1 = DeConvBN( + 32 * 2, 32, 3, 2, padding=[1, 1], outpadding=[0, 1, 0, 1]) + + self.conv3 = ConvBN( + 32, input_channel, 7, 1, norm=False, act=False, use_bias=True) + + def forward(self, inputs): + pad_input = fluid.layers.pad2d(inputs, [3, 3, 3, 3], mode="reflect") + y = self.conv0(pad_input) + y = self.conv1(y) + y = self.conv2(y) + for resnet_block in self.resnet_blocks: + y = resnet_block(y) + y = self.deconv0(y) + y = self.deconv1(y) + y = fluid.layers.pad2d(y, [3, 3, 3, 3], mode="reflect") + y = self.conv3(y) + y = fluid.layers.tanh(y) + return y + + +class NLayerDiscriminator(fluid.dygraph.Layer): + def __init__(self, input_channel, d_dims=64, d_nlayers=3): + super(NLayerDiscriminator, self).__init__() + self.conv0 = ConvBN( + input_channel, + d_dims, + 4, + 2, + 1, + norm=False, + use_bias=True, + relufactor=0.2) + + nf_mult, nf_mult_prev = 1, 1 + self.conv_layers = [] + for n in range(1, d_nlayers): + nf_mult_prev = nf_mult + nf_mult = min(2**n, 8) + conv = self.add_sublayer( + 'discriminator_%d' % (n), + ConvBN( + d_dims * nf_mult_prev, + d_dims * nf_mult, + 4, + 2, + 1, + relufactor=0.2)) + self.conv_layers.append(conv) + + nf_mult_prev = nf_mult + nf_mult = min(2**d_nlayers, 8) + self.conv4 = ConvBN( + d_dims * nf_mult_prev, d_dims * nf_mult, 4, 1, 1, relufactor=0.2) + self.conv5 = ConvBN( + d_dims * nf_mult, + 1, + 4, + 1, + 1, + norm=False, + act=None, + use_bias=True, + relufactor=0.2) + + def forward(self, inputs): + y = self.conv0(inputs) + for conv in self.conv_layers: + y = conv(y) + y = self.conv4(y) + y = self.conv5(y) + return y + + +class Generator(Model): + def __init__(self, input_channel=3): + super(Generator, self).__init__() + self.g = ResnetGenerator(input_channel) + + def forward(self, input): + fake = self.g(input) + return fake + + +class GeneratorCombine(Model): + def __init__(self, g_AB=None, g_BA=None, d_A=None, d_B=None, + is_train=True): + super(GeneratorCombine, self).__init__() + self.g_AB = g_AB + self.g_BA = g_BA + self.is_train = is_train + if self.is_train: + self.d_A = d_A + self.d_B = d_B + + def forward(self, input_A, input_B): + # Translate images to the other domain + 
fake_B = self.g_AB(input_A) + fake_A = self.g_BA(input_B) + + # Translate images back to original domain + cyc_A = self.g_BA(fake_B) + cyc_B = self.g_AB(fake_A) + if not self.is_train: + return fake_A, fake_B, cyc_A, cyc_B + + # Identity mapping of images + idt_A = self.g_AB(input_B) + idt_B = self.g_BA(input_A) + + # Discriminators determines validity of translated images + # d_A(g_AB(A)) + valid_A = self.d_A.d(fake_B) + # d_B(g_BA(A)) + valid_B = self.d_B.d(fake_A) + return input_A, input_B, fake_A, fake_B, cyc_A, cyc_B, idt_A, idt_B, valid_A, valid_B + + +class GLoss(Loss): + def __init__(self, lambda_A=10., lambda_B=10., lambda_identity=0.5): + super(GLoss, self).__init__() + self.lambda_A = lambda_A + self.lambda_B = lambda_B + self.lambda_identity = lambda_identity + + def forward(self, outputs, labels=None): + input_A, input_B, fake_A, fake_B, cyc_A, cyc_B, idt_A, idt_B, valid_A, valid_B = outputs + + def mse(a, b): + return fluid.layers.reduce_mean(fluid.layers.square(a - b)) + + def mae(a, b): # L1Loss + return fluid.layers.reduce_mean(fluid.layers.abs(a - b)) + + g_A_loss = mse(valid_A, 1.) + g_B_loss = mse(valid_B, 1.) + g_loss = g_A_loss + g_B_loss + + cyc_A_loss = mae(input_A, cyc_A) * self.lambda_A + cyc_B_loss = mae(input_B, cyc_B) * self.lambda_B + cyc_loss = cyc_A_loss + cyc_B_loss + + idt_loss_A = mae(input_B, idt_A) * (self.lambda_B * + self.lambda_identity) + idt_loss_B = mae(input_A, idt_B) * (self.lambda_A * + self.lambda_identity) + idt_loss = idt_loss_A + idt_loss_B + + loss = cyc_loss + g_loss + idt_loss + return loss + + +class Discriminator(Model): + def __init__(self, input_channel=3): + super(Discriminator, self).__init__() + self.d = NLayerDiscriminator(input_channel) + + def forward(self, real, fake): + pred_real = self.d(real) + pred_fake = self.d(fake) + return pred_real, pred_fake + + +class DLoss(Loss): + def __init__(self): + super(DLoss, self).__init__() + + def forward(self, inputs, labels=None): + pred_real, pred_fake = inputs + loss = fluid.layers.square(pred_fake) + fluid.layers.square(pred_real - + 1.) + loss = fluid.layers.reduce_mean(loss / 2.0) + return loss diff --git a/cyclegan/data.py b/cyclegan/data.py new file mode 100644 index 0000000000000000000000000000000000000000..effa4eeee12a7a4905f3cc40687d8349601bc6c6 --- /dev/null +++ b/cyclegan/data.py @@ -0,0 +1,121 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
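+#
+# Data pipeline for the cityscapes CycleGAN example: training images are
+# resized to 286x286, randomly cropped to 256x256, randomly flipped, and
+# normalized to [-1, 1]; ImagePool keeps a buffer of previously generated
+# images for discriminator updates.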
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import random +import numpy as np +from PIL import Image, ImageOps + +DATASET = "cityscapes" +A_LIST_FILE = "./data/" + DATASET + "/trainA.txt" +B_LIST_FILE = "./data/" + DATASET + "/trainB.txt" +A_TEST_LIST_FILE = "./data/" + DATASET + "/testA.txt" +B_TEST_LIST_FILE = "./data/" + DATASET + "/testB.txt" +IMAGES_ROOT = "./data/" + DATASET + "/" + +import paddle.fluid as fluid + + +class Cityscapes(fluid.io.Dataset): + def __init__(self, root_path, file_path, mode='train', return_name=False): + self.root_path = root_path + self.file_path = file_path + self.mode = mode + self.return_name = return_name + self.images = [root_path + l for l in open(file_path, 'r').readlines()] + + def _train(self, image): + ## Resize + image = image.resize((286, 286), Image.BICUBIC) + ## RandomCrop + i = np.random.randint(0, 30) + j = np.random.randint(0, 30) + image = image.crop((i, j, i + 256, j + 256)) + # RandomHorizontalFlip + if np.random.rand() > 0.5: + image = ImageOps.mirror(image) + return image + + def __getitem__(self, idx): + f = self.images[idx].strip("\n\r\t ") + image = Image.open(f) + if self.mode == 'train': + image = self._train(image) + else: + image = image.resize((256, 256), Image.BICUBIC) + # ToTensor + image = np.array(image).transpose([2, 0, 1]).astype('float32') + image = image / 255.0 + # Normalize, mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5] + image = (image - 0.5) / 0.5 + if self.return_name: + return [image], os.path.basename(f) + else: + return [image] + + def __len__(self): + return len(self.images) + + +def DataA(root=IMAGES_ROOT, fpath=A_LIST_FILE): + """ + Reader of images with A style for training. + """ + return Cityscapes(root, fpath) + + +def DataB(root=IMAGES_ROOT, fpath=B_LIST_FILE): + """ + Reader of images with B style for training. + """ + return Cityscapes(root, fpath) + + +def TestDataA(root=IMAGES_ROOT, fpath=A_TEST_LIST_FILE): + """ + Reader of images with A style for training. + """ + return Cityscapes(root, fpath, mode='test', return_name=True) + + +def TestDataB(root=IMAGES_ROOT, fpath=B_TEST_LIST_FILE): + """ + Reader of images with B style for training. 
+ """ + return Cityscapes(root, fpath, mode='test', return_name=True) + + +class ImagePool(object): + def __init__(self, pool_size=50): + self.pool = [] + self.count = 0 + self.pool_size = pool_size + + def get(self, image): + if self.count < self.pool_size: + self.pool.append(image) + self.count += 1 + return image + else: + p = random.random() + if p > 0.5: + random_id = random.randint(0, self.pool_size - 1) + temp = self.pool[random_id] + self.pool[random_id] = image + return temp + else: + return image diff --git a/cyclegan/image/A2B.png b/cyclegan/image/A2B.png new file mode 100644 index 0000000000000000000000000000000000000000..b67466da9bdf04344ac6a8f417169414641be664 Binary files /dev/null and b/cyclegan/image/A2B.png differ diff --git a/cyclegan/image/B2A.png b/cyclegan/image/B2A.png new file mode 100644 index 0000000000000000000000000000000000000000..851dd7422144a12cbdc25c47229c4db3ed727120 Binary files /dev/null and b/cyclegan/image/B2A.png differ diff --git a/cyclegan/image/net.png b/cyclegan/image/net.png new file mode 100644 index 0000000000000000000000000000000000000000..46681f8eea98995deeb03fb90257451a6fdfcdf8 Binary files /dev/null and b/cyclegan/image/net.png differ diff --git a/cyclegan/image/testA/123_A.jpg b/cyclegan/image/testA/123_A.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c78de45861aa3afaa33ecaeb6f72e444a8391987 Binary files /dev/null and b/cyclegan/image/testA/123_A.jpg differ diff --git a/cyclegan/image/testB/78_B.jpg b/cyclegan/image/testB/78_B.jpg new file mode 100644 index 0000000000000000000000000000000000000000..849c3be3ce6bd94cf38b1e2e40725727949c2a75 Binary files /dev/null and b/cyclegan/image/testB/78_B.jpg differ diff --git a/cyclegan/infer.py b/cyclegan/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..0b61a958d59e19b73fa01d3c484e1e3231fae71b --- /dev/null +++ b/cyclegan/infer.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
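+#
+# Single-image inference: --input_style=A picks the A->B translation result,
+# B picks B->A; outputs are saved as <output>/single/fake<input_style><image_name>.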
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import glob +import numpy as np +import argparse + +from PIL import Image +from scipy.misc import imsave + +import paddle.fluid as fluid +from check import check_gpu, check_version + +from model import Model, Input, set_device +from cyclegan import Generator, GeneratorCombine + + +def main(): + place = set_device(FLAGS.device) + fluid.enable_dygraph(place) if FLAGS.dynamic else None + + # Generators + g_AB = Generator() + g_BA = Generator() + g = GeneratorCombine(g_AB, g_BA, is_train=False) + + im_shape = [-1, 3, 256, 256] + input_A = Input(im_shape, 'float32', 'input_A') + input_B = Input(im_shape, 'float32', 'input_B') + g.prepare(inputs=[input_A, input_B]) + g.load(FLAGS.init_model, skip_mismatch=True, reset_optimizer=True) + + out_path = FLAGS.output + "/single" + if not os.path.exists(out_path): + os.makedirs(out_path) + for f in glob.glob(FLAGS.input): + image_name = os.path.basename(f) + image = Image.open(f).convert('RGB') + image = image.resize((256, 256), Image.BICUBIC) + image = np.array(image) / 127.5 - 1 + + image = image[:, :, 0:3].astype("float32") + data = image.transpose([2, 0, 1])[np.newaxis, :] + + if FLAGS.input_style == "A": + _, fake, _, _ = g.test([data, data]) + + if FLAGS.input_style == "B": + fake, _, _, _ = g.test([data, data]) + + fake = np.squeeze(fake[0]).transpose([1, 2, 0]) + + opath = "{}/fake{}{}".format(out_path, FLAGS.input_style, image_name) + imsave(opath, ((fake + 1) * 127.5).astype(np.uint8)) + print("transfer {} to {}".format(f, opath)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("CycleGAN inference") + parser.add_argument( + "-d", "--dynamic", action='store_false', help="Enable dygraph mode") + parser.add_argument( + "-p", + "--device", + type=str, + default='gpu', + help="device to use, gpu or cpu") + parser.add_argument( + "-i", + "--input", + type=str, + default='./image/testA/123_A.jpg', + help="input image") + parser.add_argument( + "-o", + '--output', + type=str, + default='output', + help="The test result to be saved to.") + parser.add_argument( + "-m", + "--init_model", + type=str, + default='checkpoint/199', + help="The init model file of directory.") + parser.add_argument( + "-s", "--input_style", type=str, default='A', help="A or B") + FLAGS = parser.parse_args() + print(FLAGS) + check_gpu(str.lower(FLAGS.device) == 'gpu') + check_version() + main() diff --git a/cyclegan/layers.py b/cyclegan/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..8c79ef5ff541646d98c3ce26d1d9a1888dc4421c --- /dev/null +++ b/cyclegan/layers.py @@ -0,0 +1,140 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Conv2DTranspose, BatchNorm + +# cudnn is not better when batch size is 1. 
+use_cudnn = False +import numpy as np + + +class ConvBN(fluid.dygraph.Layer): + """docstring for Conv2D""" + + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + stddev=0.02, + norm=True, + is_test=False, + act='leaky_relu', + relufactor=0.0, + use_bias=False): + super(ConvBN, self).__init__() + + pattr = fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=stddev)) + self.conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + use_cudnn=use_cudnn, + param_attr=pattr, + bias_attr=use_bias) + if norm: + self.bn = BatchNorm( + num_filters, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer(1.0, + 0.02)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0)), + is_test=False, + trainable_statistics=True) + self.relufactor = relufactor + self.norm = norm + self.act = act + + def forward(self, inputs): + conv = self.conv(inputs) + if self.norm: + conv = self.bn(conv) + + if self.act == 'leaky_relu': + conv = fluid.layers.leaky_relu(conv, alpha=self.relufactor) + elif self.act == 'relu': + conv = fluid.layers.relu(conv) + else: + conv = conv + + return conv + + +class DeConvBN(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=[0, 0], + outpadding=[0, 0, 0, 0], + stddev=0.02, + act='leaky_relu', + norm=True, + is_test=False, + relufactor=0.0, + use_bias=False): + super(DeConvBN, self).__init__() + + pattr = fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=stddev)) + self._deconv = Conv2DTranspose( + num_channels, + num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + param_attr=pattr, + bias_attr=use_bias) + if norm: + self.bn = BatchNorm( + num_filters, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NormalInitializer(1.0, + 0.02)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.0)), + is_test=False, + trainable_statistics=True) + self.outpadding = outpadding + self.relufactor = relufactor + self.use_bias = use_bias + self.norm = norm + self.act = act + + def forward(self, inputs): + conv = self._deconv(inputs) + conv = fluid.layers.pad2d( + conv, paddings=self.outpadding, mode='constant', pad_value=0.0) + + if self.norm: + conv = self.bn(conv) + + if self.act == 'leaky_relu': + conv = fluid.layers.leaky_relu(conv, alpha=self.relufactor) + elif self.act == 'relu': + conv = fluid.layers.relu(conv) + else: + conv = conv + + return conv diff --git a/cyclegan/test.py b/cyclegan/test.py new file mode 100644 index 0000000000000000000000000000000000000000..995663090f07e345e54be47da26a8c0e7fd32a4a --- /dev/null +++ b/cyclegan/test.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
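+#
+# Evaluation on paired testA/testB lists: saves fakeA/fakeB, cycA/cycB and
+# inputA/inputB images into the --output directory (default: output/eval).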
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import numpy as np +from scipy.misc import imsave + +import paddle.fluid as fluid +from check import check_gpu, check_version + +from model import Model, Input, set_device +from cyclegan import Generator, GeneratorCombine +import data as data + + +def main(): + place = set_device(FLAGS.device) + fluid.enable_dygraph(place) if FLAGS.dynamic else None + + # Generators + g_AB = Generator() + g_BA = Generator() + g = GeneratorCombine(g_AB, g_BA, is_train=False) + + im_shape = [-1, 3, 256, 256] + input_A = Input(im_shape, 'float32', 'input_A') + input_B = Input(im_shape, 'float32', 'input_B') + g.prepare(inputs=[input_A, input_B]) + g.load(FLAGS.init_model, skip_mismatch=True, reset_optimizer=True) + + if not os.path.exists(FLAGS.output): + os.makedirs(FLAGS.output) + + test_data_A = data.TestDataA() + test_data_B = data.TestDataB() + + for i in range(len(test_data_A)): + data_A, A_name = test_data_A[i] + data_B, B_name = test_data_B[i] + data_A = np.array(data_A).astype("float32") + data_B = np.array(data_B).astype("float32") + + fake_A, fake_B, cyc_A, cyc_B = g.test([data_A, data_B]) + + datas = [fake_A, fake_B, cyc_A, cyc_B, data_A, data_B] + odatas = [] + for o in datas: + d = np.squeeze(o[0]).transpose([1, 2, 0]) + im = ((d + 1) * 127.5).astype(np.uint8) + odatas.append(im) + imsave(FLAGS.output + "/fakeA_" + B_name, odatas[0]) + imsave(FLAGS.output + "/fakeB_" + A_name, odatas[1]) + imsave(FLAGS.output + "/cycA_" + A_name, odatas[2]) + imsave(FLAGS.output + "/cycB_" + B_name, odatas[3]) + imsave(FLAGS.output + "/inputA_" + A_name, odatas[4]) + imsave(FLAGS.output + "/inputB_" + B_name, odatas[5]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("CycleGAN test") + parser.add_argument( + "-d", "--dynamic", action='store_false', help="Enable dygraph mode") + parser.add_argument( + "-p", + "--device", + type=str, + default='gpu', + help="device to use, gpu or cpu") + parser.add_argument( + "-b", "--batch_size", default=1, type=int, help="batch size") + parser.add_argument( + "-o", + '--output', + type=str, + default='output/eval', + help="The test result to be saved to.") + parser.add_argument( + "-m", + "--init_model", + type=str, + default='checkpoint/199', + help="The init model file of directory.") + FLAGS = parser.parse_args() + print(FLAGS) + check_gpu(str.lower(FLAGS.device) == 'gpu') + check_version() + main() diff --git a/cyclegan/train.py b/cyclegan/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c2203fc19c8e0381fa27bde26a22a863130532e9 --- /dev/null +++ b/cyclegan/train.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
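+#
+# Training loop: each step generates fake_B/fake_A with g_AB/g_BA, updates the
+# combined generator with GLoss, then updates d_A/d_B with DLoss using images
+# drawn from an ImagePool buffer; the learning rate follows a piecewise decay
+# at epochs 100/120/140/160/180.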
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import random +import argparse +import contextlib +import time + +import paddle +import paddle.fluid as fluid +from check import check_gpu, check_version + +from model import Model, Input, set_device + +import data as data +from cyclegan import Generator, Discriminator, GeneratorCombine, GLoss, DLoss + +step_per_epoch = 2974 + + +def opt(parameters): + lr_base = 0.0002 + bounds = [100, 120, 140, 160, 180] + lr = [1., 0.8, 0.6, 0.4, 0.2, 0.1] + bounds = [i * step_per_epoch for i in bounds] + lr = [i * lr_base for i in lr] + optimizer = fluid.optimizer.Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bounds, values=lr), + parameter_list=parameters, + beta1=0.5) + return optimizer + + +def main(): + place = set_device(FLAGS.device) + fluid.enable_dygraph(place) if FLAGS.dynamic else None + + # Generators + g_AB = Generator() + g_BA = Generator() + + # Discriminators + d_A = Discriminator() + d_B = Discriminator() + + g = GeneratorCombine(g_AB, g_BA, d_A, d_B) + + da_params = d_A.parameters() + db_params = d_B.parameters() + g_params = g_AB.parameters() + g_BA.parameters() + + da_optimizer = opt(da_params) + db_optimizer = opt(db_params) + g_optimizer = opt(g_params) + + im_shape = [None, 3, 256, 256] + input_A = Input(im_shape, 'float32', 'input_A') + input_B = Input(im_shape, 'float32', 'input_B') + fake_A = Input(im_shape, 'float32', 'fake_A') + fake_B = Input(im_shape, 'float32', 'fake_B') + + g_AB.prepare(inputs=[input_A]) + g_BA.prepare(inputs=[input_B]) + + g.prepare(g_optimizer, GLoss(), inputs=[input_A, input_B]) + d_A.prepare(da_optimizer, DLoss(), inputs=[input_B, fake_B]) + d_B.prepare(db_optimizer, DLoss(), inputs=[input_A, fake_A]) + + if FLAGS.resume: + g.load(FLAGS.resume) + + loader_A = fluid.io.DataLoader( + data.DataA(), + places=place, + shuffle=True, + return_list=True, + batch_size=FLAGS.batch_size) + loader_B = fluid.io.DataLoader( + data.DataB(), + places=place, + shuffle=True, + return_list=True, + batch_size=FLAGS.batch_size) + + A_pool = data.ImagePool() + B_pool = data.ImagePool() + + for epoch in range(FLAGS.epoch): + for i, (data_A, data_B) in enumerate(zip(loader_A, loader_B)): + data_A = data_A[0][0] if not FLAGS.dynamic else data_A[0] + data_B = data_B[0][0] if not FLAGS.dynamic else data_B[0] + start = time.time() + + fake_B = g_AB.test(data_A)[0] + fake_A = g_BA.test(data_B)[0] + g_loss = g.train([data_A, data_B])[0] + fake_pb = B_pool.get(fake_B) + da_loss = d_A.train([data_B, fake_pb])[0] + + fake_pa = A_pool.get(fake_A) + db_loss = d_B.train([data_A, fake_pa])[0] + + t = time.time() - start + if i % 20 == 0: + print("epoch: {} | step: {:3d} | g_loss: {:.4f} | " \ + "da_loss: {:.4f} | db_loss: {:.4f} | s/step {:.4f}". 
+ format(epoch, i, g_loss[0], da_loss[0], db_loss[0], t)) + g.save('{}/{}'.format(FLAGS.checkpoint_path, epoch)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("CycleGAN Training on Cityscapes") + parser.add_argument( + "-d", "--dynamic", action='store_false', help="Enable dygraph mode") + parser.add_argument( + "-p", + "--device", + type=str, + default='gpu', + help="device to use, gpu or cpu") + parser.add_argument( + "-e", "--epoch", default=200, type=int, help="Epoch number") + parser.add_argument( + "-b", "--batch_size", default=1, type=int, help="batch size") + parser.add_argument( + "-o", + "--checkpoint_path", + type=str, + default='checkpoint', + help="path to save checkpoint") + parser.add_argument( + "-r", + "--resume", + default=None, + type=str, + help="checkpoint path to resume") + FLAGS = parser.parse_args() + print(FLAGS) + check_gpu(str.lower(FLAGS.device) == 'gpu') + check_version() + main() diff --git a/image_classification/main.py b/image_classification/main.py index 3d8646f94d64b41e9770f277c1eb2c38866273e2..781824fa60f9d703187697825595d81889b9c53c 100644 --- a/image_classification/main.py +++ b/image_classification/main.py @@ -1,3 +1,4 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/lac.py b/lac.py new file mode 100644 index 0000000000000000000000000000000000000000..cdd380686256b2039f6aa7f2289639559969a6a8 --- /dev/null +++ b/lac.py @@ -0,0 +1,728 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
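+#
+# Lexical analysis (LAC) network: word embedding -> stacked BiGRU -> FC ->
+# linear-chain CRF (linear_chain_crf for the loss, crf_decoding for prediction),
+# evaluated with chunk_eval precision/recall/F1.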
+""" +lexical analysis network structure +""" + +from __future__ import division +from __future__ import print_function + +import io +import os +import sys +import math +import argparse +import numpy as np + +from metrics import Metric +from model import Model, Input, Loss, set_device + +import paddle.fluid as fluid +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.initializer import NormalInitializer +from paddle.fluid.dygraph.nn import Embedding, Linear, GRUUnit + + +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + h_0=None, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + + +class BiGRU(fluid.dygraph.Layer): + def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): + super(BiGRU, self).__init__() + + self.pre_gru = Linear( + input_dim=input_dim, + output_dim=grnn_hidden_dim * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + self.gru = DynamicGRU( + size=grnn_hidden_dim, + h_0=h_0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + self.pre_gru_r = Linear( + input_dim=input_dim, + output_dim=grnn_hidden_dim * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + self.gru_r = DynamicGRU( + size=grnn_hidden_dim, + is_reverse=True, + h_0=h_0, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-init_bound, high=init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + def forward(self, input_feature): + res_pre_gru = self.pre_gru(input_feature) + res_gru = self.gru(res_pre_gru) + res_pre_gru_r = self.pre_gru_r(input_feature) + res_gru_r = self.gru_r(res_pre_gru_r) + bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1) + return bi_merge + + +class Linear_chain_crf(fluid.dygraph.Layer): + def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): + super(Linear_chain_crf, self).__init__() + + self._param_attr = param_attr + self._dtype = dtype + self._size = size + self._is_test = is_test + self._transition = self.create_parameter( + attr=self._param_attr, + shape=[self._size + 2, self._size], + dtype=self._dtype) + + @property + def weight(self): + return self._transition + + @weight.setter + def weight(self, 
value): + self._transition = value + + def forward(self, input, label, length=None): + + alpha = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + emission_exps = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + transition_exps = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + log_likelihood = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + this_inputs = { + "Emission": [input], + "Transition": self._transition, + "Label": [label] + } + if length: + this_inputs['Length'] = [length] + self._helper.append_op( + type='linear_chain_crf', + inputs=this_inputs, + outputs={ + "Alpha": [alpha], + "EmissionExps": [emission_exps], + "TransitionExps": transition_exps, + "LogLikelihood": log_likelihood + }, + attrs={"is_test": self._is_test, }) + return log_likelihood + + +class Crf_decoding(fluid.dygraph.Layer): + def __init__(self, param_attr, size=None, is_test=False, dtype='float32'): + super(Crf_decoding, self).__init__() + + self._dtype = dtype + self._size = size + self._is_test = is_test + self._param_attr = param_attr + self._transition = self.create_parameter( + attr=self._param_attr, + shape=[self._size + 2, self._size], + dtype=self._dtype) + + @property + def weight(self): + return self._transition + + @weight.setter + def weight(self, value): + self._transition = value + + def forward(self, input, label=None, length=None): + + viterbi_path = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + this_inputs = { + "Emission": [input], + "Transition": self._transition, + "Label": label + } + if length: + this_inputs['Length'] = [length] + self._helper.append_op( + type='crf_decoding', + inputs=this_inputs, + outputs={"ViterbiPath": [viterbi_path]}, + attrs={"is_test": self._is_test, }) + return viterbi_path + + +class Chunk_eval(fluid.dygraph.Layer): + def __init__(self, + num_chunk_types, + chunk_scheme, + excluded_chunk_types=None): + super(Chunk_eval, self).__init__() + self.num_chunk_types = num_chunk_types + self.chunk_scheme = chunk_scheme + self.excluded_chunk_types = excluded_chunk_types + + def forward(self, input, label, seq_length=None): + precision = self._helper.create_variable_for_type_inference( + dtype="float32") + recall = self._helper.create_variable_for_type_inference( + dtype="float32") + f1_score = self._helper.create_variable_for_type_inference( + dtype="float32") + num_infer_chunks = self._helper.create_variable_for_type_inference( + dtype="int64") + num_label_chunks = self._helper.create_variable_for_type_inference( + dtype="int64") + num_correct_chunks = self._helper.create_variable_for_type_inference( + dtype="int64") + + this_input = {"Inference": input, "Label": label[0]} + if seq_length: + this_input["SeqLength"] = seq_length[0] + self._helper.append_op( + type='chunk_eval', + inputs=this_input, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score], + "NumInferChunks": [num_infer_chunks], + "NumLabelChunks": [num_label_chunks], + "NumCorrectChunks": [num_correct_chunks] + }, + attrs={ + "num_chunk_types": self.num_chunk_types, + "chunk_scheme": self.chunk_scheme, + "excluded_chunk_types": self.excluded_chunk_types or [] + }) + return (num_infer_chunks, num_label_chunks, num_correct_chunks) + + +class LAC(Model): + def __init__(self, args, vocab_size, num_labels, length=None): + super(LAC, self).__init__() + """ + define the lexical analysis network structure + word: stores the input of the model + for_infer: a boolean 
value, indicating if the model to be created is for training or predicting. + + return: + for infer: return the prediction + otherwise: return the prediction + """ + self.word_emb_dim = args.word_emb_dim + self.vocab_size = vocab_size + self.num_labels = num_labels + self.grnn_hidden_dim = args.grnn_hidden_dim + self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir( + args) else 1.0 + self.crf_lr = args.emb_learning_rate if 'crf_learning_rate' in dir( + args) else 1.0 + self.bigru_num = args.bigru_num + self.init_bound = 0.1 + + self.word_embedding = Embedding( + size=[self.vocab_size, self.word_emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr( + learning_rate=self.emb_lr, + name="word_emb", + initializer=fluid.initializer.Uniform( + low=-self.init_bound, high=self.init_bound))) + + h_0 = fluid.layers.create_global_var( + shape=[args.batch_size, self.grnn_hidden_dim], + value=0.0, + dtype='float32', + persistable=True, + force_cpu=True, + name='h_0') + + self.bigru_units = [] + for i in range(self.bigru_num): + if i == 0: + self.bigru_units.append( + self.add_sublayer( + "bigru_units%d" % i, + BiGRU( + self.grnn_hidden_dim, + self.grnn_hidden_dim, + self.init_bound, + h_0=h_0))) + else: + self.bigru_units.append( + self.add_sublayer( + "bigru_units%d" % i, + BiGRU( + self.grnn_hidden_dim * 2, + self.grnn_hidden_dim, + self.init_bound, + h_0=h_0))) + + self.fc = Linear( + input_dim=self.grnn_hidden_dim * 2, + output_dim=self.num_labels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=-self.init_bound, high=self.init_bound), + regularizer=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-4))) + + self.linear_chain_crf = Linear_chain_crf( + param_attr=fluid.ParamAttr( + name='linear_chain_crfw', learning_rate=self.crf_lr), + size=self.num_labels) + + self.crf_decoding = Crf_decoding( + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=self.crf_lr), + size=self.num_labels) + + def forward(self, word, target, lengths): + """ + Configure the network + """ + word_embed = self.word_embedding(word) + input_feature = word_embed + + for i in range(self.bigru_num): + bigru_output = self.bigru_units[i](input_feature) + input_feature = bigru_output + + emission = self.fc(bigru_output) + + crf_cost = self.linear_chain_crf( + input=emission, label=target, length=lengths) + avg_cost = fluid.layers.mean(x=crf_cost) + self.crf_decoding.weight = self.linear_chain_crf.weight + crf_decode = self.crf_decoding(input=emission, length=lengths) + return crf_decode, avg_cost, lengths + + +class LacLoss(Loss): + def __init__(self): + super(LacLoss, self).__init__() + pass + + def forward(self, outputs, labels): + avg_cost = outputs[1] + return avg_cost + + +class ChunkEval(Metric): + def __init__(self, num_labels, name=None, *args, **kwargs): + super(ChunkEval, self).__init__(*args, **kwargs) + self._init_name(name) + self.chunk_eval = Chunk_eval( + int(math.ceil((num_labels - 1) / 2.0)), "IOB") + self.reset() + + def add_metric_op(self, pred, label, *args, **kwargs): + crf_decode = pred[0] + lengths = pred[2] + (num_infer_chunks, num_label_chunks, + num_correct_chunks) = self.chunk_eval( + input=crf_decode, label=label, seq_length=lengths) + return [num_infer_chunks, num_label_chunks, num_correct_chunks] + + def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks, + *args, **kwargs): + self.infer_chunks_total += num_infer_chunks + self.label_chunks_total += num_label_chunks + self.correct_chunks_total += num_correct_chunks + 
precision = float( + num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0 + recall = float( + num_correct_chunks) / num_label_chunks if num_label_chunks else 0 + f1_score = float(2 * precision * recall) / ( + precision + recall) if num_correct_chunks else 0 + return [precision, recall, f1_score] + + def reset(self): + self.infer_chunks_total = 0 + self.label_chunks_total = 0 + self.correct_chunks_total = 0 + + def accumulate(self): + precision = float( + self.correct_chunks_total + ) / self.infer_chunks_total if self.infer_chunks_total else 0 + recall = float( + self.correct_chunks_total + ) / self.label_chunks_total if self.label_chunks_total else 0 + f1_score = float(2 * precision * recall) / ( + precision + recall) if self.correct_chunks_total else 0 + res = [precision, recall, f1_score] + return res + + def _init_name(self, name): + name = name or 'chunk eval' + self._name = ['precision', 'recall', 'F1'] + + def name(self): + return self._name + + +class LacDataset(object): + """ + Load lexical analysis dataset + """ + + def __init__(self, args): + self.word_dict_path = args.word_dict_path + self.label_dict_path = args.label_dict_path + self.word_rep_dict_path = args.word_rep_dict_path + self._load_dict() + + def _load_dict(self): + self.word2id_dict = self.load_kv_dict( + self.word_dict_path, reverse=True, value_func=np.int64) + self.id2word_dict = self.load_kv_dict(self.word_dict_path) + self.label2id_dict = self.load_kv_dict( + self.label_dict_path, reverse=True, value_func=np.int64) + self.id2label_dict = self.load_kv_dict(self.label_dict_path) + if self.word_rep_dict_path is None: + self.word_replace_dict = dict() + else: + self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path) + + def load_kv_dict(self, + dict_path, + reverse=False, + delimiter="\t", + key_func=None, + value_func=None): + """ + Load key-value dict from file + """ + result_dict = {} + for line in io.open(dict_path, "r", encoding='utf8'): + terms = line.strip("\n").split(delimiter) + if len(terms) != 2: + continue + if reverse: + value, key = terms + else: + key, value = terms + if key in result_dict: + raise KeyError("key duplicated with [%s]" % (key)) + if key_func: + key = key_func(key) + if value_func: + value = value_func(value) + result_dict[key] = value + return result_dict + + @property + def vocab_size(self): + return len(self.word2id_dict.values()) + + @property + def num_labels(self): + return len(self.label2id_dict.values()) + + def get_num_examples(self, filename): + """num of line of file""" + return sum(1 for line in io.open(filename, "r", encoding='utf8')) + + def word_to_ids(self, words): + """convert word to word index""" + word_ids = [] + for word in words: + word = self.word_replace_dict.get(word, word) + if word not in self.word2id_dict: + word = "OOV" + word_id = self.word2id_dict[word] + word_ids.append(word_id) + + return word_ids + + def label_to_ids(self, labels): + """convert label to label index""" + label_ids = [] + for label in labels: + if label not in self.label2id_dict: + label = "O" + label_id = self.label2id_dict[label] + label_ids.append(label_id) + return label_ids + + def file_reader(self, + filename, + mode="train", + batch_size=32, + max_seq_len=126): + """ + yield (word_idx, target_idx) one by one from file, + or yield (word_idx, ) in `infer` mode + """ + + def wrapper(): + fread = io.open(filename, "r", encoding="utf-8") + headline = next(fread) + headline = headline.strip().split('\t') + assert len(headline) == 2 and headline[0] == "text_a" and 
headline[ + 1] == "label" + buf = [] + for line in fread: + words, labels = line.strip("\n").split("\t") + if len(words) < 1: + continue + word_ids = self.word_to_ids(words.split("\002")) + label_ids = self.label_to_ids(labels.split("\002")) + assert len(word_ids) == len(label_ids) + word_ids = word_ids[0:max_seq_len] + words_len = np.int64(len(word_ids)) + word_ids += [0 for _ in range(max_seq_len - words_len)] + label_ids = label_ids[0:max_seq_len] + label_ids += [0 for _ in range(max_seq_len - words_len)] + assert len(word_ids) == len(label_ids) + yield word_ids, label_ids, words_len + fread.close() + + return wrapper + + +def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): + def wrapper(): + batch_words, batch_labels, seq_lens = [], [], [] + for epoch in xrange(args.epoch): + for instance in reader.file_reader( + file_name, mode, max_seq_len=args.max_seq_len)(): + words, labels, words_len = instance + if len(seq_lens) < args.batch_size: + batch_words.append(words) + batch_labels.append(labels) + seq_lens.append(words_len) + if len(seq_lens) == args.batch_size: + yield batch_words, batch_labels, seq_lens, batch_labels + batch_words, batch_labels, seq_lens = [], [], [] + + if len(seq_lens) > 0: + yield batch_words, batch_labels, seq_lens, batch_labels + batch_words, batch_labels, seq_lens = [], [], [] + + return wrapper + + +def create_dataloader(generator, place, feed_list=None): + if not feed_list: + data_loader = fluid.io.DataLoader.from_generator( + capacity=50, + use_double_buffer=True, + iterable=True, + return_list=True) + else: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=feed_list, + capacity=50, + use_double_buffer=True, + iterable=True, + return_list=True) + data_loader.set_batch_generator(generator, places=place) + return data_loader + + +def main(args): + place = set_device(args.device) + fluid.enable_dygraph(place) if args.dynamic else None + + inputs = [ + Input( + [None, args.max_seq_len], 'int64', name='words'), Input( + [None, args.max_seq_len], 'int64', name='target'), Input( + [None], 'int64', name='length') + ] + labels = [Input([None, args.max_seq_len], 'int64', name='labels')] + + feed = [x.forward() for x in inputs + labels] + dataset = LacDataset(args) + train_path = os.path.join(args.data, "train.tsv") + test_path = os.path.join(args.data, "test.tsv") + + if args.dynamic: + feed_list = None + else: + feed_list = feed + train_generator = create_lexnet_data_generator( + args, reader=dataset, file_name=train_path, place=place, mode="train") + test_generator = create_lexnet_data_generator( + args, reader=dataset, file_name=test_path, place=place, mode="test") + + train_dataset = create_dataloader( + train_generator, place, feed_list=feed_list) + test_dataset = create_dataloader( + test_generator, place, feed_list=feed_list) + + vocab_size = dataset.vocab_size + num_labels = dataset.num_labels + model = LAC(args, vocab_size, num_labels) + + optim = AdamOptimizer( + learning_rate=args.base_learning_rate, + parameter_list=model.parameters()) + + model.prepare( + optim, + LacLoss(), + ChunkEval(num_labels), + inputs=inputs, + labels=labels, + device=args.device) + + if args.resume is not None: + model.load(args.resume) + + model.fit(train_dataset, + test_dataset, + epochs=args.epoch, + batch_size=args.batch_size, + eval_freq=args.eval_freq, + save_freq=args.save_freq, + save_dir=args.save_dir) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser("LAC training") + parser.add_argument( + "-dir", "--data", 
default=None, type=str, help='path to LAC dataset') + parser.add_argument( + "-wd", + "--word_dict_path", + default=None, + type=str, + help='word dict path') + parser.add_argument( + "-ld", + "--label_dict_path", + default=None, + type=str, + help='label dict path') + parser.add_argument( + "-wrd", + "--word_rep_dict_path", + default=None, + type=str, + help='The path of the word replacement Dictionary.') + parser.add_argument( + "-dev", + "--device", + type=str, + default='gpu', + help="device to use, gpu or cpu") + parser.add_argument( + "-d", "--dynamic", action='store_true', help="enable dygraph mode") + parser.add_argument( + "-e", "--epoch", default=10, type=int, help="number of epoch") + parser.add_argument( + '-lr', + '--base_learning_rate', + default=1e-3, + type=float, + metavar='LR', + help='initial learning rate') + parser.add_argument( + "--word_emb_dim", + default=128, + type=int, + help='word embedding dimension') + parser.add_argument( + "--grnn_hidden_dim", default=128, type=int, help="hidden dimension") + parser.add_argument( + "--bigru_num", default=2, type=int, help='the number of bi-rnn') + parser.add_argument("-elr", "--emb_learning_rate", default=1.0, type=float) + parser.add_argument("-clr", "--crf_learning_rate", default=1.0, type=float) + parser.add_argument( + "-b", "--batch_size", default=300, type=int, help="batch size") + parser.add_argument( + "--max_seq_len", default=126, type=int, help="max sequence length") + parser.add_argument( + "-n", "--num_devices", default=1, type=int, help="number of devices") + parser.add_argument( + "-r", + "--resume", + default=None, + type=str, + help="checkpoint path to resume") + parser.add_argument( + "-o", + "--save_dir", + default="./model", + type=str, + help="save model path") + parser.add_argument( + "-sf", "--save_freq", default=1, type=int, help="save frequency") + parser.add_argument( + "-ef", "--eval_freq", default=1, type=int, help="eval frequency") + + args = parser.parse_args() + print(args) + main(args) diff --git a/model.py b/model.py index dea21bb98329404d02c10e2a563f21d76f7851e1..e6faeb762cccd6e3fc56b99e265d86fc77691690 100644 --- a/model.py +++ b/model.py @@ -114,9 +114,9 @@ class Loss(object): def forward(self, outputs, labels): raise NotImplementedError() - def __call__(self, outputs, labels): + def __call__(self, outputs, labels=None): labels = to_list(labels) - if in_dygraph_mode(): + if in_dygraph_mode() and labels: labels = [to_variable(l) for l in labels] losses = to_list(self.forward(to_list(outputs), labels)) if self.average: @@ -410,7 +410,8 @@ class StaticGraphAdapter(object): and self.model._optimizer._learning_rate_map: # HACK workaround learning rate map issue lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] - self.model._optimizer._learning_rate_map[prog] = lr_var + new_lr_var = prog.global_block().vars[lr_var.name] + self.model._optimizer._learning_rate_map[prog] = new_lr_var losses = [] metrics = [] @@ -852,8 +853,6 @@ class Model(fluid.dygraph.Layer): if not isinstance(inputs, (list, dict, Input)): raise TypeError( "'inputs' must be list or dict in static graph mode") - if loss_function and not isinstance(labels, (list, Input)): - raise TypeError("'labels' must be list in static graph mode") metrics = metrics or [] for metric in to_list(metrics): @@ -1083,7 +1082,7 @@ class Model(fluid.dygraph.Layer): return eval_result - def predict(self, test_data, batch_size=1, num_workers=0): + def predict(self, test_data, batch_size=1, num_workers=0, stack_outputs=True): """ FIXME: 
add more comments and usage Args: @@ -1096,6 +1095,12 @@ class Model(fluid.dygraph.Layer): num_workers (int): the number of subprocess to load data, 0 for no subprocess used and loading data in main process. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. + stack_output (bool): whether stack output field like a batch, as for an output + filed of a sample is in shape [X, Y], test_data contains N samples, predict + output field will be in shape [N, X, Y] if stack_output is True, and will + be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs + is False. stack_outputs as False is used for LoDTensor output situation, + it is recommended set as True if outputs contains no LoDTensor. Default False """ if fluid.in_dygraph_mode(): @@ -1122,19 +1127,16 @@ class Model(fluid.dygraph.Layer): if not isinstance(test_loader, Iterable): loader = test_loader() - outputs = None + outputs = [] for data in tqdm.tqdm(loader): - if not fluid.in_dygraph_mode(): - data = data[0] - - outs = self.test(*data) + data = flatten(data) + outputs.append(self.test(data[:len(self._inputs)])) - if outputs is None: - outputs = outs - else: - outputs = [ - np.vstack([x, outs[i]]) for i, x in enumerate(outputs) - ] + # NOTE: for lod tensor output, we should not stack outputs + # for stacking may loss its detail info + outputs = list(zip(*outputs)) + if stack_outputs: + outputs = [np.stack(outs, axis=0) for outs in outputs] self._test_dataloader = None if test_loader is not None and self._adapter._nranks > 1 \ diff --git a/models/__init__.py b/models/__init__.py index b2de8281de91398e808a5b09754d8c021c137dd3..02071502d382d01aaba49594b6dfcb766294ff59 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,4 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from . import resnet +from . import vgg +from . import mobilenetv1 +from . import mobilenetv2 +from . import darknet +from . import yolov3 + from .resnet import * from .mobilenetv1 import * from .mobilenetv2 import * from .vgg import * +from .darknet import * +from .yolov3 import * + +__all__ = resnet.__all__ \ + + vgg.__all__ \ + + mobilenetv1.__all__ \ + + mobilenetv2.__all__ \ + + darknet.__all__ \ + + yolov3.__all__ diff --git a/models/darknet.py b/models/darknet.py new file mode 100755 index 0000000000000000000000000000000000000000..095cf7d63c628483b3b0842f4c54d81bba75ceb6 --- /dev/null +++ b/models/darknet.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from paddle.fluid.dygraph.nn import Conv2D, BatchNorm + +from model import Model +from .download import get_weights_path + +__all__ = ['DarkNet53', 'ConvBNLayer', 'darknet53'] + +# {num_layers: (url, md5)} +pretrain_infos = { + 53: ('https://paddlemodels.bj.bcebos.com/hapi/darknet53.pdparams', + '2506357a5c31e865785112fc614a487d') +} + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=1, + groups=1, + padding=0, + act="leaky"): + super(ConvBNLayer, self).__init__() + + self.conv = Conv2D( + num_channels=ch_in, + num_filters=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + param_attr=ParamAttr( + initializer=fluid.initializer.Normal(0., 0.02)), + bias_attr=False, + act=None) + self.batch_norm = BatchNorm( + num_channels=ch_out, + param_attr=ParamAttr( + initializer=fluid.initializer.Normal(0., 0.02), + regularizer=L2Decay(0.)), + bias_attr=ParamAttr( + initializer=fluid.initializer.Constant(0.0), + regularizer=L2Decay(0.))) + + self.act = act + + def forward(self, inputs): + out = self.conv(inputs) + out = self.batch_norm(out) + if self.act == 'leaky': + out = fluid.layers.leaky_relu(x=out, alpha=0.1) + return out + +class DownSample(fluid.dygraph.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size=3, + stride=2, + padding=1): + + super(DownSample, self).__init__() + + self.conv_bn_layer = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=filter_size, + stride=stride, + padding=padding) + self.ch_out = ch_out + def forward(self, inputs): + out = self.conv_bn_layer(inputs) + return out + +class BasicBlock(fluid.dygraph.Layer): + def __init__(self, ch_in, ch_out): + super(BasicBlock, self).__init__() + + self.conv1 = ConvBNLayer( + ch_in=ch_in, + ch_out=ch_out, + filter_size=1, + stride=1, + padding=0) + self.conv2 = ConvBNLayer( + ch_in=ch_out, + ch_out=ch_out*2, + filter_size=3, + stride=1, + padding=1) + def forward(self, inputs): + conv1 = self.conv1(inputs) + conv2 = self.conv2(conv1) + out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None) + return out + +class LayerWarp(fluid.dygraph.Layer): + def __init__(self, ch_in, ch_out, count): + super(LayerWarp,self).__init__() + + self.basicblock0 = BasicBlock(ch_in, ch_out) + self.res_out_list = [] + for i in range(1,count): + res_out = self.add_sublayer("basic_block_%d" % (i), + BasicBlock( + ch_out*2, + ch_out)) + self.res_out_list.append(res_out) + self.ch_out = ch_out + def forward(self,inputs): + y = self.basicblock0(inputs) + for basic_block_i in self.res_out_list: + y = basic_block_i(y) + return y + + +DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} + + +class DarkNet53(Model): + def __init__(self, num_layers=53, ch_in=3): + super(DarkNet53, self).__init__() + assert num_layers in DarkNet_cfg.keys(), \ + "only support num_layers in {} currently" \ + .format(DarkNet_cfg.keys()) + self.stages = DarkNet_cfg[num_layers] + self.stages = self.stages[0:5] + + self.conv0 = ConvBNLayer( + ch_in=ch_in, + ch_out=32, + filter_size=3, + stride=1, + padding=1) + + self.downsample0 = DownSample( + ch_in=32, + ch_out=32 * 2) + self.darknet53_conv_block_list = [] + self.downsample_list = [] + ch_in = [64,128,256,512,1024] + for i, stage in enumerate(self.stages): + conv_block = self.add_sublayer( + "stage_%d" 
% (i), + LayerWarp( + int(ch_in[i]), + 32*(2**i), + stage)) + self.darknet53_conv_block_list.append(conv_block) + for i in range(len(self.stages) - 1): + downsample = self.add_sublayer( + "stage_%d_downsample" % i, + DownSample( + ch_in = 32*(2**(i+1)), + ch_out = 32*(2**(i+2)))) + self.downsample_list.append(downsample) + + def forward(self,inputs): + + out = self.conv0(inputs) + out = self.downsample0(out) + blocks = [] + for i, conv_block_i in enumerate(self.darknet53_conv_block_list): + out = conv_block_i(out) + blocks.append(out) + if i < len(self.stages) - 1: + out = self.downsample_list[i](out) + return blocks[-1:-4:-1] + + +def _darknet(num_layers=53, input_channels=3, pretrained=True): + model = DarkNet53(num_layers, input_channels) + if pretrained: + assert num_layers in pretrain_infos.keys(), \ + "DarkNet{} do not have pretrained weights now, " \ + "pretrained should be set as False".format(num_layers) + weight_path = get_weights_path(*(pretrain_infos[num_layers])) + assert weight_path.endswith('.pdparams'), \ + "suffix of weight must be .pdparams" + model.load(weight_path[:-9]) + return model + + +def darknet53(input_channels=3, pretrained=True): + return _darknet(53, input_channels, pretrained) diff --git a/models/yolov3.py b/models/yolov3.py new file mode 100644 index 0000000000000000000000000000000000000000..c2bbc88ee27cb08269bd2a986ff7b55b4f199999 --- /dev/null +++ b/models/yolov3.py @@ -0,0 +1,250 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
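+
+# NOTE: a minimal, hypothetical usage sketch of the `darknet53` factory defined
+# in models/darknet.py above (kept as comments only; names and shapes here are
+# for illustration and are not part of this module):
+#
+#     import numpy as np
+#     import paddle.fluid as fluid
+#     from models.darknet import darknet53
+#
+#     with fluid.dygraph.guard():
+#         backbone = darknet53(pretrained=False)
+#         img = fluid.dygraph.to_variable(
+#             np.zeros([1, 3, 608, 608], dtype="float32"))
+#         c5, c4, c3 = backbone(img)  # three feature maps at strides 32/16/8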
+ +from __future__ import division +from __future__ import print_function + +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.regularizer import L2Decay + +from model import Model, Loss +from .darknet import darknet53, ConvBNLayer +from .download import get_weights_path + +__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53'] + +# {num_layers: (url, md5)} +pretrain_infos = { + 53: ('https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams', + 'aed7dd45124ff2e844ae3bd5ba6c91d2') +} + + +class YoloDetectionBlock(fluid.dygraph.Layer): + def __init__(self, ch_in, channel): + super(YoloDetectionBlock, self).__init__() + + assert channel % 2 == 0, \ + "channel {} cannot be divided by 2".format(channel) + + self.conv0 = ConvBNLayer( + ch_in=ch_in, + ch_out=channel, + filter_size=1, + stride=1, + padding=0) + self.conv1 = ConvBNLayer( + ch_in=channel, + ch_out=channel*2, + filter_size=3, + stride=1, + padding=1) + self.conv2 = ConvBNLayer( + ch_in=channel*2, + ch_out=channel, + filter_size=1, + stride=1, + padding=0) + self.conv3 = ConvBNLayer( + ch_in=channel, + ch_out=channel*2, + filter_size=3, + stride=1, + padding=1) + self.route = ConvBNLayer( + ch_in=channel*2, + ch_out=channel, + filter_size=1, + stride=1, + padding=0) + self.tip = ConvBNLayer( + ch_in=channel, + ch_out=channel*2, + filter_size=3, + stride=1, + padding=1) + + def forward(self, inputs): + out = self.conv0(inputs) + out = self.conv1(out) + out = self.conv2(out) + out = self.conv3(out) + route = self.route(out) + tip = self.tip(route) + return route, tip + + +class YOLOv3(Model): + def __init__(self, num_classes=80, model_mode='train'): + super(YOLOv3, self).__init__() + self.num_classes = num_classes + assert str.lower(model_mode) in ['train', 'eval', 'test'], \ + "model_mode should be 'train' 'eval' or 'test', but got " \ + "{}".format(model_mode) + self.model_mode = str.lower(model_mode) + self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, + 59, 119, 116, 90, 156, 198, 373, 326] + self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + self.valid_thresh = 0.005 + self.nms_thresh = 0.45 + self.nms_topk = 400 + self.nms_posk = 100 + self.draw_thresh = 0.5 + + self.backbone = darknet53(pretrained=(model_mode=='train')) + self.block_outputs = [] + self.yolo_blocks = [] + self.route_blocks = [] + + for idx, num_chan in enumerate([1024, 768, 384]): + yolo_block = self.add_sublayer( + "yolo_detecton_block_{}".format(idx), + YoloDetectionBlock(num_chan, 512 // (2**idx))) + self.yolo_blocks.append(yolo_block) + + num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5) + + block_out = self.add_sublayer( + "block_out_{}".format(idx), + Conv2D(num_channels=1024 // (2**idx), + num_filters=num_filters, + filter_size=1, + act=None, + param_attr=ParamAttr( + initializer=fluid.initializer.Normal(0., 0.02)), + bias_attr=ParamAttr( + initializer=fluid.initializer.Constant(0.0), + regularizer=L2Decay(0.)))) + self.block_outputs.append(block_out) + if idx < 2: + route = self.add_sublayer( + "route2_{}".format(idx), + ConvBNLayer(ch_in=512 // (2**idx), + ch_out=256 // (2**idx), + filter_size=1, + act='leaky_relu')) + self.route_blocks.append(route) + + def forward(self, img_info, inputs): + outputs = [] + boxes = [] + scores = [] + downsample = 32 + + feats = self.backbone(inputs) + route = None + for idx, feat in enumerate(feats): + if idx > 0: + feat = fluid.layers.concat(input=[route, feat], axis=1) + route, tip = 
self.yolo_blocks[idx](feat) + block_out = self.block_outputs[idx](tip) + outputs.append(block_out) + + if idx < 2: + route = self.route_blocks[idx](route) + route = fluid.layers.resize_nearest(route, scale=2) + + if self.model_mode != 'train': + anchor_mask = self.anchor_masks[idx] + mask_anchors = [] + for m in anchor_mask: + mask_anchors.append(self.anchors[2 * m]) + mask_anchors.append(self.anchors[2 * m + 1]) + img_shape = fluid.layers.slice(img_info, axes=[1], starts=[1], ends=[3]) + img_id = fluid.layers.slice(img_info, axes=[1], starts=[0], ends=[1]) + b, s = fluid.layers.yolo_box( + x=block_out, + img_size=img_shape, + anchors=mask_anchors, + class_num=self.num_classes, + conf_thresh=self.valid_thresh, + downsample_ratio=downsample) + + boxes.append(b) + scores.append(fluid.layers.transpose(s, perm=[0, 2, 1])) + + downsample //= 2 + + if self.model_mode == 'train': + return outputs + + preds = [img_id[0, :], + fluid.layers.multiclass_nms( + bboxes=fluid.layers.concat(boxes, axis=1), + scores=fluid.layers.concat(scores, axis=2), + score_threshold=self.valid_thresh, + nms_top_k=self.nms_topk, + keep_top_k=self.nms_posk, + nms_threshold=self.nms_thresh, + background_label=-1)] + + if self.model_mode == 'test': + return preds + + # model_mode == "eval" + return outputs + preds + +class YoloLoss(Loss): + def __init__(self, num_classes=80, num_max_boxes=50): + super(YoloLoss, self).__init__() + self.num_classes = num_classes + self.num_max_boxes = num_max_boxes + self.ignore_thresh = 0.7 + self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, + 59, 119, 116, 90, 156, 198, 373, 326] + self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] + + def forward(self, outputs, labels): + downsample = 32 + gt_box, gt_label, gt_score = labels + losses = [] + + for idx, out in enumerate(outputs): + if idx == 3: break # debug + anchor_mask = self.anchor_masks[idx] + loss = fluid.layers.yolov3_loss( + x=out, + gt_box=gt_box, + gt_label=gt_label, + gt_score=gt_score, + anchor_mask=anchor_mask, + downsample_ratio=downsample, + anchors=self.anchors, + class_num=self.num_classes, + ignore_thresh=self.ignore_thresh, + use_label_smooth=True) + loss = fluid.layers.reduce_mean(loss) + losses.append(loss) + downsample //= 2 + return losses + + +def _yolov3_darknet(num_layers=53, num_classes=80, + model_mode='train', pretrained=True): + model = YOLOv3(num_classes, model_mode) + if pretrained: + assert num_layers in pretrain_infos.keys(), \ + "YOLOv3-DarkNet{} do not have pretrained weights now, " \ + "pretrained should be set as False".format(num_layers) + weight_path = get_weights_path(*(pretrain_infos[num_layers])) + assert weight_path.endswith('.pdparams'), \ + "suffix of weight must be .pdparams" + model.load(weight_path[:-9]) + return model + + +def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True): + return _yolov3_darknet(53, num_classes, model_mode, pretrained) diff --git a/text.py b/text.py new file mode 100644 index 0000000000000000000000000000000000000000..2702981d6e274ab20e250b7499b65632ffd7a3ba --- /dev/null +++ b/text.py @@ -0,0 +1,992 @@ +import collections +import copy +import six +import sys +from functools import partial, reduce + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.utils as utils +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm +from paddle.fluid.data_feeder import convert_dtype + +from paddle.fluid import layers +from 
paddle.fluid.dygraph import Layer +from paddle.fluid.layers import BeamSearchDecoder + +__all__ = [ + 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', + 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', + 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', + 'TransformerDecoder', 'TransformerBeamSearchDecoder' +] + + +class RNNCell(Layer): + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0, + batch_dim_idx=0): + """ + Generate initialized states according to provided shape, data type and + value. + + Parameters: + batch_ref: A (possibly nested structure of) tensor variable[s]. + The first dimension of the tensor will be used as batch size to + initialize states. + shape: A (possiblely nested structure of) shape[s], where a shape is + represented as a list/tuple of integer). -1(for batch size) will + beautomatically inserted if shape is not started with it. If None, + property `state_shape` will be used. The default value is None. + dtype: A (possiblely nested structure of) data type[s]. The structure + must be same as that of `shape`, except when all tensors' in states + has the same data type, a single data type can be used. If None and + property `cell.state_shape` is not available, float32 will be used + as the data type. The default value is None. + init_value: A float value used to initialize states. + + Returns: + Variable: tensor variable[s] packed in the same structure provided \ + by shape, representing the initialized states. + """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce( + lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), + states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: # use fp32 as default + states_dtypes = "float32" + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is represented + as a list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). 
+ Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class BasicLSTMCell(RNNCell): + """ + **** + BasicLSTMUnit class, Using basic operator to build LSTM + The algorithm can be described as the code below. + .. math:: + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + h_t &= o_t \odot tanh(c_t) + - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). + - sigmoid is the logistic sigmoid function. + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + - The :math:`\odot` is the element-wise product of the vectors. + - :math:`tanh` is the activation functions. + - :math:`\\tilde{c_t}` is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + Args: + name_scope(string) : The name scope used to identify parameter and bias name + hidden_size (integer): The hidden size used in the Unit. + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight matrix. Note: + If it is set to None or one attribute of ParamAttr, lstm_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The parameter attribute for the bias + of LSTM unit. + If it is set to None or one attribute of ParamAttr, lstm_unit will + create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized as zero. Default: None. + gate_activation (function|None): The activation function for gates (actGate). + Default: 'fluid.layers.sigmoid' + activation (function|None): The activation function for cells (actNode). 
+ Default: 'fluid.layers.tanh' + forget_bias(float|1.0): forget bias used when computing forget gate + dtype(string): data type used in this unit + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + forget_bias=1.0, + dtype='float32'): + super(BasicLSTMCell, self).__init__() + + self._hidden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + self._forget_bias = layers.fill_constant( + [1], dtype=dtype, value=forget_bias) + self._forget_bias.stop_gradient = False + self._dtype = dtype + self._input_size = input_size + + self._weight = self.create_parameter( + attr=self._param_attr, + shape=[ + self._input_size + self._hidden_size, 4 * self._hidden_size + ], + dtype=self._dtype) + + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=[4 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, input, state): + pre_hidden, pre_cell = state + concat_input_hidden = layers.concat([input, pre_hidden], 1) + gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + + gate_input = layers.elementwise_add(gate_input, self._bias) + i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) + new_cell = layers.elementwise_add( + layers.elementwise_mul( + pre_cell, + layers.sigmoid(layers.elementwise_add(f, self._forget_bias))), + layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j))) + new_hidden = layers.tanh(new_cell) * layers.sigmoid(o) + + return new_hidden, [new_hidden, new_cell] + + @property + def state_shape(self): + return [[self._hidden_size], [self._hidden_size]] + + +class BasicGRUCell(RNNCell): + """ + **** + BasicGRUUnit class, using basic operators to build GRU + The algorithm can be described as the equations below. + + .. math:: + u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u) + + r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r) + + m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m) + + h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) + + Args: + hidden_size (integer): The hidden size used in the Unit. + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight matrix. Note: + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The parameter attribute for the bias + of GRU unit. + If it is set to None or one attribute of ParamAttr, gru_unit will + create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + gate_activation (function|None): The activation function for gates (actGate). + Default: 'fluid.layers.sigmoid' + activation (function|None): The activation function for cell (actNode). 
+ Default: 'fluid.layers.tanh' + dtype(string): data type used in this unit + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + dtype='float32'): + super(BasicGRUCell, self).__init__() + self._input_size = input_size + self._hiden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + self._dtype = dtype + + if self._param_attr is not None and self._param_attr.name is not None: + gate_param_attr = copy.deepcopy(self._param_attr) + candidate_param_attr = copy.deepcopy(self._param_attr) + gate_param_attr.name += "_gate" + candidate_param_attr.name += "_candidate" + else: + gate_param_attr = self._param_attr + candidate_param_attr = self._param_attr + + self._gate_weight = self.create_parameter( + attr=gate_param_attr, + shape=[self._input_size + self._hiden_size, 2 * self._hiden_size], + dtype=self._dtype) + + self._candidate_weight = self.create_parameter( + attr=candidate_param_attr, + shape=[self._input_size + self._hiden_size, self._hiden_size], + dtype=self._dtype) + + if self._bias_attr is not None and self._bias_attr.name is not None: + gate_bias_attr = copy.deepcopy(self._bias_attr) + candidate_bias_attr = copy.deepcopy(self._bias_attr) + gate_bias_attr.name += "_gate" + candidate_bias_attr.name += "_candidate" + else: + gate_bias_attr = self._bias_attr + candidate_bias_attr = self._bias_attr + + self._gate_bias = self.create_parameter( + attr=gate_bias_attr, + shape=[2 * self._hiden_size], + dtype=self._dtype, + is_bias=True) + self._candidate_bias = self.create_parameter( + attr=candidate_bias_attr, + shape=[self._hiden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, input, state): + pre_hidden = state + concat_input_hidden = layers.concat([input, pre_hidden], axis=1) + + gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) + + gate_input = layers.elementwise_add(gate_input, self._gate_bias) + + gate_input = self._gate_activation(gate_input) + r, u = layers.split(gate_input, num_or_sections=2, dim=1) + + r_hidden = r * pre_hidden + + candidate = layers.matmul( + layers.concat([input, r_hidden], 1), self._candidate_weight) + candidate = layers.elementwise_add(candidate, self._candidate_bias) + + c = self._activation(candidate) + new_hidden = u * pre_hidden + (1 - u) * c + + return new_hidden + + @property + def state_shape(self): + return [self._hidden_size] + + +class RNN(fluid.dygraph.Layer): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) if time_major else (0, + 1) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = fluid.layers.elementwise_mul( + new_state, step_mask, + axis=0) - fluid.layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state + + flat_inputs = flatten(inputs) + batch_size, time_steps = ( + flat_inputs[0].shape[self.batch_index], + 
flat_inputs[0].shape[self.time_step_index]) + + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, batch_dim_idx=self.batch_index) + + if not self.time_major: + inputs = map_structure( + lambda x: fluid.layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) + + if sequence_length: + mask = fluid.layers.sequence_mask( + sequence_length, + maxlen=time_steps, + dtype=flatten(initial_states)[0].dtype) + mask = fluid.layers.transpose(mask, [1, 0]) + + if self.is_reverse: + inputs = map_structure( + lambda x: fluid.layers.reverse(x, axis=[0]), inputs) + mask = fluid.layers.reverse( + mask, axis=[0]) if sequence_length else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) + if sequence_length: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, + axis=self.time_step_index), + outputs) + + if self.is_reverse: + final_outputs = map_structure( + lambda x: fluid.layers.reverse(x, + axis=self.time_step_index), + final_outputs) + + final_states = new_states + else: + final_outputs, final_states = fluid.layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class DynamicDecode(Layer): + def __init__(self, + decoder, + max_step_num=None, + output_time_major=False, + impute_finished=False, + is_test=False, + return_length=False): + super(DynamicDecode, self).__init__() + self.decoder = decoder + self.max_step_num = max_step_num + self.output_time_major = output_time_major + self.impute_finished = impute_finished + self.is_test = is_test + self.return_length = return_length + + def forward(self, inits=None, **kwargs): + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def __getitem__(self, item): + return self.array.__getitem__(item) + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + state_dtype = state.dtype + if convert_dtype(state_dtype) in ["bool"]: + state = layers.cast(state, dtype="float32") + new_state = layers.cast(new_state, dtype="float32") + if step_mask.dtype != state.dtype: + step_mask = layers.cast(step_mask, dtype=state.dtype) + # otherwise, renamed bool gradients of would be summed up leading + # to sum(bool) error. 
+ step_mask.stop_gradient = True + new_state = layers.elementwise_mul( + state, step_mask, axis=0) - layers.elementwise_mul( + new_state, (step_mask - 1), axis=0) + if convert_dtype(state_dtype) in ["bool"]: + new_state = layers.cast(new_state, dtype=state_dtype) + return new_state + + initial_inputs, initial_states, initial_finished = self.decoder.initialize( + inits) + inputs, states, finished = (initial_inputs, initial_states, + initial_finished) + cond = layers.logical_not((layers.reduce_all(initial_finished))) + sequence_lengths = layers.cast( + layers.zeros_like(initial_finished), "int64") + outputs = None + + step_idx = 0 + step_idx_tensor = layers.fill_constant( + shape=[1], dtype="int64", value=step_idx) + while cond.numpy(): + (step_outputs, next_states, next_inputs, + next_finished) = self.decoder.step(step_idx_tensor, inputs, + states, **kwargs) + next_finished = layers.logical_or(next_finished, finished) + next_sequence_lengths = layers.elementwise_add( + sequence_lengths, + layers.cast( + layers.logical_not(finished), sequence_lengths.dtype)) + + if self.impute_finished: # rectify the states for the finished. + next_states = map_structure( + lambda x, y: _maybe_copy(x, y, finished), states, + next_states) + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if step_idx == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + inputs, states, finished, sequence_lengths = ( + next_inputs, next_states, next_finished, + next_sequence_lengths) + + layers.increment(x=step_idx_tensor, value=1.0, in_place=True) + step_idx += 1 + + layers.logical_not(layers.reduce_all(finished), cond) + if self.max_step_num is not None and step_idx > self.max_step_num: + break + + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, axis=0), outputs) + final_states = states + + try: + final_outputs, final_states = self.decoder.finalize( + final_outputs, final_states, sequence_lengths) + except NotImplementedError: + pass + + if not self.output_time_major: + final_outputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), final_outputs) + + return (final_outputs, final_states, + sequence_lengths) if self.return_length else ( + final_outputs, final_states) + else: + return fluid.layers.dynamic_decode( + self.decoder, + inits, + max_step_num=self.max_step_num, + output_time_major=self.output_time_major, + impute_finished=self.impute_finished, + is_test=self.is_test, + return_length=self.return_length, + **kwargs) + + +class TransfomerCell(object): + """ + Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be + used as RNNCell + """ + + def __init__(self, decoder): + self.decoder = decoder + + def __call__(self, inputs, states, trg_src_attn_bias, enc_output, + static_caches): + trg_word, trg_pos = inputs + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, + enc_output, states) + new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + return logits, new_states + + +class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): + def __init__(self, cell, start_token, end_token, beam_size, + var_dim_in_state): + super(TransformerBeamSearchDecoder, + self).__init__(cell, start_token, end_token, beam_size) + self.cell = cell + self.var_dim_in_state = var_dim_in_state + + def _merge_batch_beams_with_var_dim(self, x): + # init length of cache is 0, and it increases with 
decoding carrying on, + # thus need to reshape elaborately + var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim + x = layers.transpose(x, + list(range(var_dim_in_state, len(x.shape))) + + list(range(0, var_dim_in_state))) + x = layers.reshape( + x, [0] * (len(x.shape) - var_dim_in_state + ) + [self.batch_size * self.beam_size] + + [int(size) for size in x.shape[-var_dim_in_state + 2:]]) + x = layers.transpose( + x, + list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + + list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) + return x + + def _split_batch_beams_with_var_dim(self, x): + var_dim_size = layers.shape(x)[self.var_dim_in_state] + x = layers.reshape( + x, [-1, self.beam_size] + + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + return x + + def step(self, time, inputs, states, **kwargs): + # compared to RNN, Transformer has 3D data at every decoding step + inputs = layers.reshape(inputs, [-1, 1]) # token + pos = layers.ones_like(inputs) * time # pos + cell_states = map_structure(self._merge_batch_beams_with_var_dim, + states.cell_states) + + cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, + **kwargs) + cell_outputs = map_structure(self._split_batch_beams, cell_outputs) + next_cell_states = map_structure(self._split_batch_beams_with_var_dim, + next_cell_states) + + beam_search_output, beam_search_state = self._beam_search_step( + time=time, + logits=cell_outputs, + next_cell_states=next_cell_states, + beam_state=states) + next_inputs, finished = (beam_search_output.predicted_ids, + beam_search_state.finished) + + return (beam_search_output, beam_search_state, next_inputs, finished) + + +### Transformer Modules ### +class PrePostProcessLayer(Layer): + """ + PrePostProcessLayer + """ + + def __init__(self, process_cmd, d_model, dropout_rate): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append(lambda x, y: x + y if y else x) + elif cmd == "n": # add layer normalization + self.functors.append( + self.add_sublayer( + "layer_norm_%d" % len( + self.sublayers(include_sublayers=False)), + LayerNorm( + normalized_shape=d_model, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))))) + elif cmd == "d": # add dropout + self.functors.append(lambda x: layers.dropout( + x, dropout_prob=dropout_rate, is_test=False) + if dropout_rate else x) + + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) + else: + x = self.functors[i](x) + return x + + +class MultiHeadAttention(Layer): + """ + Multi-Head Attention + """ + + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.v_fc = Linear( + input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) + self.proj_fc = Linear( + input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) + + def _prepare_qkv(self, 
queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True + + q = self.q_fc(queries) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] = k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + q, k, v = self._prepare_qkv(queries, keys, values, cache) + + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if self.dropout_rate: + weights = layers.dropout( + weights, dropout_prob=self.dropout_rate, is_test=False) + + out = layers.matmul(weights, v) + + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.proj_fc(out) + return out + + def cal_kv(self, keys, values): + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + +class FFN(Layer): + """ + Feed-Forward Network + """ + + def __init__(self, d_inner_hid, d_model, dropout_rate): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + self.fc1 = Linear( + input_dim=d_model, output_dim=d_inner_hid, act="relu") + self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) + + def forward(self, x): + hidden = self.fc1(x) + if self.dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=self.dropout_rate, is_test=False) + out = self.fc2(hidden) + return out + + +class TransformerEncoderLayer(Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(TransformerEncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + 
prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class TransformerEncoder(Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(TransformerEncoder, self).__init__() + + self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerEncoderLayer( + n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + + return self.processer(enc_output) + + +class TransformerDecoderLayer(Layer): + """ + decoder + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + super(TransformerDecoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + cache=None): + self_attn_output = self.self_attn( + self.preprocesser1(dec_input), None, None, self_attn_bias, cache) + self_attn_output = self.postprocesser1(self_attn_output, dec_input) + + cross_attn_output = self.cross_attn( + self.preprocesser2(self_attn_output), enc_output, enc_output, + cross_attn_bias, cache) + cross_attn_output = self.postprocesser2(cross_attn_output, + self_attn_output) + + ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) + ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + + return ffn_output + + +class TransformerDecoder(Layer): + """ + decoder + """ + + def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd): + super(TransformerDecoder, self).__init__() + + self.decoder_layers = list() + for i in range(n_layer): + self.decoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + TransformerDecoderLayer( + n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + 
relu_dropout, preprocess_cmd, postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + caches=None): + for i, decoder_layer in enumerate(self.decoder_layers): + dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, + cross_attn_bias, None + if caches is None else caches[i]) + dec_input = dec_output + + return self.processer(dec_output) + + def prepare_static_cache(self, enc_output): + return [ + dict( + zip(("static_k", "static_v"), + decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) + for decoder_layer in self.decoder_layers + ] diff --git a/transformer/README.md b/transformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b03ca1f09ef641bd590ca4833f7cb50d81fabd2 --- /dev/null +++ b/transformer/README.md @@ -0,0 +1,289 @@ +## Transformer + +以下是本例的简要目录结构及说明: + +```text +. +├── images # README 文档中的图片 +├── utils # 工具包 +├── gen_data.sh # 数据生成脚本 +├── predict.py # 预测脚本 +├── reader.py # 数据读取接口 +├── README.md # 文档 +├── train.py # 训练脚本 +├── model.py # 模型定义文件 +└── transformer.yaml # 配置文件 +``` + +## 模型简介 + +机器翻译(machine translation, MT)是利用计算机将一种自然语言(源语言)转换为另一种自然语言(目标语言)的过程,输入为源语言句子,输出为相应的目标语言的句子。 + +本项目是机器翻译领域主流模型 Transformer 的 PaddlePaddle 实现, 包含模型训练,预测以及使用自定义数据等内容。用户可以基于发布的内容搭建自己的翻译模型。 + + +## 快速开始 + +### 安装说明 + +1. paddle安装 + + 本项目依赖于 PaddlePaddle 1.7及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 + +2. 下载代码 + + 克隆代码库到本地 + ```shell + git clone https://github.com/PaddlePaddle/models.git + cd models/dygraph/transformer + ``` + +3. 环境依赖 + + 请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html)部分的内容 + + +### 数据准备 + +公开数据集:WMT 翻译大赛是机器翻译领域最具权威的国际评测大赛,其中英德翻译任务提供了一个中等规模的数据集,这个数据集是较多论文中使用的数据集,也是 Transformer 论文中用到的一个数据集。我们也将[WMT'16 EN-DE 数据集](http://www.statmt.org/wmt16/translation-task.html)作为示例提供。运行 `gen_data.sh` 脚本进行 WMT'16 EN-DE 数据集的下载和预处理(时间较长,建议后台运行)。数据处理过程主要包括 Tokenize 和 [BPE 编码(byte-pair encoding)](https://arxiv.org/pdf/1508.07909)。运行成功后,将会生成文件夹 `gen_data`,其目录结构如下: + +```text +. 
+├── wmt16_ende_data # WMT16 英德翻译数据 +├── wmt16_ende_data_bpe # BPE 编码的 WMT16 英德翻译数据 +├── mosesdecoder # Moses 机器翻译工具集,包含了 Tokenize、BLEU 评估等脚本 +└── subword-nmt # BPE 编码的代码 +``` + +另外我们也整理提供了一份处理好的 WMT'16 EN-DE 数据以供[下载](https://transformer-res.bj.bcebos.com/wmt16_ende_data_bpe_clean.tar.gz)使用,其中包含词典(`vocab_all.bpe.32000`文件)、训练所需的 BPE 数据(`train.tok.clean.bpe.32000.en-de`文件)、预测所需的 BPE 数据(`newstest2016.tok.bpe.32000.en-de`等文件)和相应的评估预测结果所需的 tokenize 数据(`newstest2016.tok.de`等文件)。 + + +自定义数据:如果需要使用自定义数据,本项目程序中可直接支持的数据格式为制表符 \t 分隔的源语言和目标语言句子对,句子中的 token 之间使用空格分隔。提供以上格式的数据文件(可以分多个part,数据读取支持文件通配符)和相应的词典文件即可直接运行。 + +### 单机训练 + +### 单机单卡 + +以提供的英德翻译数据为例,可以执行以下命令进行模型训练: + +```sh +# setting visible devices for training +export CUDA_VISIBLE_DEVICES=0 + +python -u train.py \ + --epoch 30 \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \ + --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 4096 +``` + +以上命令中传入了训练轮数(`epoch`)和训练数据文件路径(注意请正确设置,支持通配符)等参数,更多参数的使用以及支持的模型超参数可以参见 `transformer.yaml` 配置文件,其中默认提供了 Transformer base model 的配置,如需调整可以在配置文件中更改或通过命令行传入(命令行传入内容将覆盖配置文件中的设置)。可以通过以下命令来训练 Transformer 论文中的 big model: + +```sh +# setting visible devices for training +export CUDA_VISIBLE_DEVICES=0 + +python -u train.py \ + --epoch 30 \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \ + --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 4096 \ + --n_head 16 \ + --d_model 1024 \ + --d_inner_hid 4096 \ + --prepostprocess_dropout 0.3 +``` + +另外,如果在执行训练时若提供了 `save_model`(默认为 trained_models),则每隔一定 iteration 后(通过参数 `save_step` 设置,默认为10000)将保存当前训练的到相应目录(会保存分别记录了模型参数和优化器状态的 `transformer.pdparams` 和 `transformer.pdopt` 两个文件),每隔一定数目的 iteration (通过参数 `print_step` 设置,默认为100)将打印如下的日志到标准输出: + +```txt +[2019-08-02 15:30:51,656 INFO train.py:262] step_idx: 150100, epoch: 32, batch: 1364, avg loss: 2.880427, normalized loss: 1.504687, ppl: 17.821888, speed: 3.34 step/s +[2019-08-02 15:31:19,824 INFO train.py:262] step_idx: 150200, epoch: 32, batch: 1464, avg loss: 2.955965, normalized loss: 1.580225, ppl: 19.220257, speed: 3.55 step/s +[2019-08-02 15:31:48,151 INFO train.py:262] step_idx: 150300, epoch: 32, batch: 1564, avg loss: 2.951180, normalized loss: 1.575439, ppl: 19.128502, speed: 3.53 step/s +[2019-08-02 15:32:16,401 INFO train.py:262] step_idx: 150400, epoch: 32, batch: 1664, avg loss: 3.027281, normalized loss: 1.651540, ppl: 20.641024, speed: 3.54 step/s +[2019-08-02 15:32:44,764 INFO train.py:262] step_idx: 150500, epoch: 32, batch: 1764, avg loss: 3.069125, normalized loss: 1.693385, ppl: 21.523066, speed: 3.53 step/s +[2019-08-02 15:33:13,199 INFO train.py:262] step_idx: 150600, epoch: 32, batch: 1864, avg loss: 2.869379, normalized loss: 1.493639, ppl: 17.626074, speed: 3.52 step/s +[2019-08-02 15:33:41,601 INFO train.py:262] step_idx: 150700, epoch: 32, batch: 1964, avg loss: 2.980905, normalized loss: 1.605164, ppl: 19.705633, speed: 3.52 step/s +[2019-08-02 15:34:10,079 INFO train.py:262] step_idx: 150800, epoch: 32, batch: 2064, avg loss: 3.047716, normalized loss: 1.671976, ppl: 21.067181, speed: 3.51 step/s +[2019-08-02 
15:34:38,598 INFO train.py:262] step_idx: 150900, epoch: 32, batch: 2164, avg loss: 2.956475, normalized loss: 1.580735, ppl: 19.230072, speed: 3.51 step/s +``` + +也可以使用 CPU 训练(通过参数 `--use_cuda False` 设置),训练速度较慢。 + +#### 单机多卡 + +Paddle动态图支持多进程多卡进行模型训练,启动训练的方式如下: + +```sh +python -m paddle.distributed.launch --started_port 8999 --selected_gpus=0,1,2,3,4,5,6,7 --log_dir ./mylog train.py \ + --epoch 30 \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \ + --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 4096 \ + --print_step 100 \ + --use_cuda True \ + --save_step 10000 +``` + +此时,程序会将每个进程的输出log导入到`./mylog`路径下,只有第一个工作进程会保存模型。 + +``` +. +├── mylog +│   ├── workerlog.0 +│   ├── workerlog.1 +│   ├── workerlog.2 +│   ├── workerlog.3 +│   ├── workerlog.4 +│   ├── workerlog.5 +│   ├── workerlog.6 +│   └── workerlog.7 +``` + +### 模型推断 + +以英德翻译数据为例,模型训练完成后可以执行以下命令对指定文件中的文本进行翻译: + +```sh +# setting visible devices for prediction +export CUDA_VISIBLE_DEVICES=0 + +python -u predict.py \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 32 \ + --init_from_params trained_params/step_100000 \ + --beam_size 5 \ + --max_out_len 255 \ + --output_file predict.txt +``` + + 由 `predict_file` 指定的文件中文本的翻译结果会输出到 `output_file` 指定的文件。执行预测时需要设置 `init_from_params` 来给出模型所在目录,更多参数的使用可以在 `transformer.yaml` 文件中查阅注释说明并进行更改设置。注意若在执行预测时设置了模型超参数,应与模型训练时的设置一致,如若训练时使用 big model 的参数设置,则预测时对应类似如下命令: + +```sh +# setting visible devices for prediction +export CUDA_VISIBLE_DEVICES=0 + +python -u predict.py \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 32 \ + --init_from_params trained_params/step_100000 \ + --beam_size 5 \ + --max_out_len 255 \ + --output_file predict.txt \ + --n_head 16 \ + --d_model 1024 \ + --d_inner_hid 4096 \ + --prepostprocess_dropout 0.3 +``` + + +### 模型评估 + +预测结果中每行输出是对应行输入的得分最高的翻译,对于使用 BPE 的数据,预测出的翻译结果也将是 BPE 表示的数据,要还原成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估。评估过程具体如下(BLEU 是翻译任务常用的自动评估方法指标): + +```sh +# 还原 predict.txt 中的预测结果为 tokenize 后的数据 +sed -r 's/(@@ )|(@@ ?$)//g' predict.txt > predict.tok.txt +# 若无 BLEU 评估工具,需先进行下载 +# git clone https://github.com/moses-smt/mosesdecoder.git +# 以英德翻译 newstest2014 测试数据为例 +perl gen_data/mosesdecoder/scripts/generic/multi-bleu.perl gen_data/wmt16_ende_data/newstest2014.tok.de < predict.tok.txt +``` +可以看到类似如下的结果: +``` +BLEU = 26.35, 57.7/32.1/20.0/13.0 (BP=1.000, ratio=1.013, hyp_len=63903, ref_len=63078) +``` + +使用本项目中提供的内容,英德翻译 base model 和 big model 八卡训练 100K 个 iteration 后测试有大约如下的 BLEU 值: + +| 测试集 | newstest2014 | newstest2015 | newstest2016 | +|-|-|-|-| +| Base | 26.35 | 29.07 | 33.30 | +| Big | 27.07 | 30.09 | 34.38 | + +### 预训练模型 + +我们这里提供了对应有以上 BLEU 值的 [base model](https://transformer-res.bj.bcebos.com/base_model_dygraph.tar.gz) 和 [big model](https://transformer-res.bj.bcebos.com/big_model_dygraph.tar.gz) 的模型参数提供下载使用(注意,模型使用了提供下载的数据进行训练和测试)。 + +## 进阶使用 + +### 背景介绍 + +Transformer 是论文 [Attention Is All 
You Need](https://arxiv.org/abs/1706.03762) 中提出的用以完成机器翻译(machine translation, MT)等序列到序列(sequence to sequence, Seq2Seq)学习任务的一种全新网络结构,其完全使用注意力(Attention)机制来实现序列到序列的建模[1]。 + +相较于此前 Seq2Seq 模型中广泛使用的循环神经网络(Recurrent Neural Network, RNN),使用(Self)Attention 进行输入序列到输出序列的变换主要具有以下优势: + +- 计算复杂度小 + - 特征维度为 d 、长度为 n 的序列,在 RNN 中计算复杂度为 `O(n * d * d)` (n 个时间步,每个时间步计算 d 维的矩阵向量乘法),在 Self-Attention 中计算复杂度为 `O(n * n * d)` (n 个时间步两两计算 d 维的向量点积或其他相关度函数),n 通常要小于 d 。 +- 计算并行度高 + - RNN 中当前时间步的计算要依赖前一个时间步的计算结果;Self-Attention 中各时间步的计算只依赖输入不依赖之前时间步输出,各时间步可以完全并行。 +- 容易学习长程依赖(long-range dependencies) + - RNN 中相距为 n 的两个位置间的关联需要 n 步才能建立;Self-Attention 中任何两个位置都直接相连;路径越短信号传播越容易。 + +Transformer 中引入使用的基于 Self-Attention 的序列建模模块结构,已被广泛应用在 Bert [2]等语义表示模型中,取得了显著效果。 + + +### 模型概览 + +Transformer 同样使用了 Seq2Seq 模型中典型的编码器-解码器(Encoder-Decoder)的框架结构,整体网络结构如图1所示。 + +
+<p align="center">
+<img src="./images/transformer_network.png" /> <br />
+图 1. Transformer 网络结构图
+</p>
+ +可以看到,和以往 Seq2Seq 模型不同,Transformer 的 Encoder 和 Decoder 中不再使用 RNN 的结构。 + +### 模型特点 + +Transformer 中的 Encoder 由若干相同的 layer 堆叠组成,每个 layer 主要由多头注意力(Multi-Head Attention)和全连接的前馈(Feed-Forward)网络这两个 sub-layer 构成。 +- Multi-Head Attention 在这里用于实现 Self-Attention,相比于简单的 Attention 机制,其将输入进行多路线性变换后分别计算 Attention 的结果,并将所有结果拼接后再次进行线性变换作为输出。参见图2,其中 Attention 使用的是点积(Dot-Product),并在点积后进行了 scale 的处理以避免因点积结果过大进入 softmax 的饱和区域。 +- Feed-Forward 网络会对序列中的每个位置进行相同的计算(Position-wise),其采用的是两次线性变换中间加以 ReLU 激活的结构。 + +此外,每个 sub-layer 后还施以 Residual Connection [3]和 Layer Normalization [4]来促进梯度传播和模型收敛。 + +
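+为便于理解上述计算流程,下面给出一个最小的 NumPy 示意(仅为帮助理解的假设性示例,变量名与写法均不对应本项目的实现):输入经多路线性变换并按 head 拆分后,各 head 分别计算缩放点积注意力,结果拼接后再做一次线性变换作为输出。
+
+```python
+import numpy as np
+
+def softmax(x, axis=-1):
+    x = x - x.max(axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / e.sum(axis=axis, keepdims=True)
+
+def scaled_dot_product_attention(q, k, v):
+    # q, k, v: [batch, n_head, seq_len, d_head];点积后除以 sqrt(d_head) 做 scale,
+    # 避免点积结果过大使 softmax 进入饱和区域
+    d_head = q.shape[-1]
+    scores = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(d_head)
+    return np.matmul(softmax(scores), v)
+
+def multi_head_attention(x, n_head, wq, wk, wv, wo):
+    # x: [batch, seq_len, d_model];wq/wk/wv/wo: [d_model, d_model]
+    batch, seq_len, d_model = x.shape
+    d_head = d_model // n_head
+
+    def split_heads(t):
+        # [batch, seq_len, d_model] -> [batch, n_head, seq_len, d_head]
+        return t.reshape(batch, seq_len, n_head, d_head).transpose(0, 2, 1, 3)
+
+    q, k, v = split_heads(x @ wq), split_heads(x @ wk), split_heads(x @ wv)
+    out = scaled_dot_product_attention(q, k, v)          # 各 head 独立计算注意力
+    out = out.transpose(0, 2, 1, 3).reshape(batch, seq_len, d_model)  # 拼接所有 head
+    return out @ wo                                      # 再次线性变换作为输出
+
+x = np.random.rand(2, 5, 8).astype("float32")            # batch=2, seq_len=5, d_model=8
+ws = [np.random.rand(8, 8).astype("float32") * 0.1 for _ in range(4)]
+print(multi_head_attention(x, 2, *ws).shape)             # (2, 5, 8)
+```
+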
+<p align="center">
+<img src="./images/multi_head_attention.png" /> <br />
+图 2. Multi-Head Attention
+</p>
+ +Decoder 具有和 Encoder 类似的结构,只是相比于组成 Encoder 的 layer ,在组成 Decoder 的 layer 中还多了一个 Multi-Head Attention 的 sub-layer 来实现对 Encoder 输出的 Attention,这个 Encoder-Decoder Attention 在其他 Seq2Seq 模型中也是存在的。 + +## FAQ + +**Q:** 预测结果中样本数少于输入的样本数是什么原因 +**A:** 若样本中最大长度超过 `transformer.yaml` 中 `max_length` 的默认设置,请注意运行时增大 `--max_length` 的设置,否则超长样本将被过滤。 + +**Q:** 预测时最大长度超过了训练时的最大长度怎么办 +**A:** 由于训练时 `max_length` 的设置决定了保存模型 position encoding 的大小,若预测时长度超过 `max_length`,请调大该值,会重新生成更大的 position encoding 表。 + + +## 参考文献 +1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010. +2. Devlin J, Chang M W, Lee K, et al. [Bert: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805)[J]. arXiv preprint arXiv:1810.04805, 2018. +3. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778. +4. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016. +5. Sennrich R, Haddow B, Birch A. [Neural machine translation of rare words with subword units](https://arxiv.org/pdf/1508.07909)[J]. arXiv preprint arXiv:1508.07909, 2015. + + +## 作者 +- [guochengCS](https://github.com/guoshengCS) + +## 如何贡献代码 + +如果你可以修复某个issue或者增加一个新功能,欢迎给我们提交PR。如果对应的PR被接受了,我们将根据贡献的质量和难度进行打分(0-5分,越高越好)。如果你累计获得了10分,可以联系我们获得面试机会或者为你写推荐信。 diff --git a/transformer/images/multi_head_attention.png b/transformer/images/multi_head_attention.png new file mode 100644 index 0000000000000000000000000000000000000000..427fb6b32aaeb7013066a167aab4fb97c024c2d6 Binary files /dev/null and b/transformer/images/multi_head_attention.png differ diff --git a/transformer/images/transformer_network.png b/transformer/images/transformer_network.png new file mode 100644 index 0000000000000000000000000000000000000000..34be0e5c7e2b08f858683d86353db5e81049c7ca Binary files /dev/null and b/transformer/images/transformer_network.png differ diff --git a/transformer/predict.py b/transformer/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9c5629f9e5ebc7198f009d7d05df263d640281 --- /dev/null +++ b/transformer/predict.py @@ -0,0 +1,150 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import six +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from functools import partial + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.io import DataLoader +from paddle.fluid.layers.utils import flatten + +from utils.configure import PDConfig +from utils.check import check_gpu, check_version + +from model import Input, set_device +from reader import prepare_infer_input, Seq2SeqDataset, Seq2SeqBatchSampler +from transformer import InferTransformer, position_encoding_init + + +def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, + output_eos=False): + """ + Post-process the decoded sequence. + """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = [ + idx for idx in seq[:eos_pos + 1] + if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx) + ] + return seq + + +def do_predict(args): + device = set_device("gpu" if args.use_cuda else "cpu") + fluid.enable_dygraph(device) if args.eager_run else None + + inputs = [ + Input( + [None, None], "int64", name="src_word"), + Input( + [None, None], "int64", name="src_pos"), + Input( + [None, args.n_head, None, None], + "float32", + name="src_slf_attn_bias"), + Input( + [None, args.n_head, None, None], + "float32", + name="trg_src_attn_bias"), + ] + + # define data + dataset = Seq2SeqDataset( + fpattern=args.predict_file, + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + token_delimiter=args.token_delimiter, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2]) + args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ + args.unk_idx = dataset.get_vocab_summary() + trg_idx2word = Seq2SeqDataset.load_dict( + dict_path=args.trg_vocab_fpath, reverse=True) + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, + use_token_batch=False, + batch_size=args.batch_size, + max_length=args.max_length) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + feed_list=None + if fluid.in_dygraph_mode() else [x.forward() for x in inputs], + collate_fn=partial( + prepare_infer_input, src_pad_idx=args.eos_idx, n_head=args.n_head), + num_workers=0, + return_list=True) + + # define model + transformer = InferTransformer( + args.src_vocab_size, + args.trg_vocab_size, + args.max_length + 1, + args.n_layer, + args.n_head, + args.d_key, + args.d_value, + args.d_model, + args.d_inner_hid, + args.prepostprocess_dropout, + args.attention_dropout, + args.relu_dropout, + args.preprocess_cmd, + args.postprocess_cmd, + args.weight_sharing, + args.bos_idx, + args.eos_idx, + beam_size=args.beam_size, + max_out_len=args.max_out_len) + transformer.prepare(inputs=inputs) + + # load the trained model + assert args.init_from_params, ( + "Please set init_from_params to load the infer model.") + transformer.load(os.path.join(args.init_from_params, "transformer")) + + # TODO: use model.predict when support variant length + f = open(args.output_file, "wb") + for data in data_loader(): + finished_seq = transformer.test(inputs=flatten(data))[0] + finished_seq = np.transpose(finished_seq, [0, 2, 1]) + for ins in finished_seq: + for beam_idx, beam in enumerate(ins): + if beam_idx >= args.n_best: break + id_list = post_process_seq(beam, args.bos_idx, args.eos_idx) + word_list = [trg_idx2word[id] for id in id_list] + sequence = b" ".join(word_list) + b"\n" + 
f.write(sequence) + + +if __name__ == "__main__": + args = PDConfig(yaml_file="./transformer.yaml") + args.build() + args.Print() + check_gpu(args.use_cuda) + check_version() + + do_predict(args) diff --git a/transformer/reader.py b/transformer/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..b83617d7179b65d9e2bc335eec1f226423a08fdb --- /dev/null +++ b/transformer/reader.py @@ -0,0 +1,423 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import six +import os +import tarfile +import itertools + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.fluid.io import BatchSampler, DataLoader, Dataset + + +def prepare_train_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Put all padded data needed by training into a list. + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len) + src_pos = src_pos.reshape(-1, src_max_len) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len) + trg_pos = trg_pos.reshape(-1, trg_max_len) + + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + lbl_word = lbl_word.reshape(-1, 1) + lbl_weight = lbl_weight.reshape(-1, 1) + + data_inputs = [ + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ] + + return data_inputs + + +def prepare_infer_input(insts, src_pad_idx, n_head): + """ + Put all padded data needed by beam search decoder into a list. + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, 1, 1]).astype("float32") + src_word = src_word.reshape(-1, src_max_len) + src_pos = src_pos.reshape(-1, src_max_len) + + data_inputs = [ + src_word, src_pos, src_slf_attn_bias, trg_src_attn_bias + ] + return data_inputs + + +def pad_batch_data(insts, + pad_idx, + n_head, + is_target=False, + is_label=False, + return_attn_bias=True, + return_max_len=True, + return_num_token=False): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. 
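+
+    Depending on the flags, the returned list contains (in order): the padded
+    word ids, label weights (if is_label) or position ids (otherwise), the
+    attention bias tensor (if return_attn_bias), the batch max length
+    (if return_max_len) and the total number of tokens (if return_num_token).
+    A single value instead of a list is returned when only one item is left.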
+ """ + return_list = [] + max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if is_label: # label weight + inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst)) + for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + list(range(0, len(inst))) + [0] * (max_len - len(inst)) + for inst in insts + ]) + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones( + (inst_data.shape[0], max_len, max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, + 1).reshape([-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + if return_num_token: + num_token = 0 + for inst in insts: + num_token += len(inst) + return_list += [num_token] + return return_list if len(return_list) > 1 else return_list[0] + + +class SortType(object): + GLOBAL = 'global' + POOL = 'pool' + NONE = "none" + + +class Converter(object): + def __init__(self, vocab, beg, end, unk, delimiter, add_beg): + self._vocab = vocab + self._beg = beg + self._end = end + self._unk = unk + self._delimiter = delimiter + self._add_beg = add_beg + + def __call__(self, sentence): + return ([self._beg] if self._add_beg else []) + [ + self._vocab.get(w, self._unk) + for w in sentence.split(self._delimiter) + ] + [self._end] + + +class ComposedConverter(object): + def __init__(self, converters): + self._converters = converters + + def __call__(self, parallel_sentence): + return [ + self._converters[i](parallel_sentence[i]) + for i in range(len(self._converters)) + ] + + +class SentenceBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self._batch_size = batch_size + + def append(self, info): + self.batch.append(info) + if len(self.batch) == self._batch_size: + tmp = self.batch + self.batch = [] + return tmp + + +class TokenBatchCreator(object): + def __init__(self, batch_size): + self.batch = [] + self.max_len = -1 + self._batch_size = batch_size + + def append(self, info): + cur_len = info.max_len + max_len = max(self.max_len, cur_len) + if max_len * (len(self.batch) + 1) > self._batch_size: + result = self.batch + self.batch = [info] + self.max_len = cur_len + return result + else: + self.max_len = max_len + self.batch.append(info) + + +class SampleInfo(object): + def __init__(self, i, max_len, min_len): + self.i = i + self.min_len = min_len + self.max_len = max_len + + +class MinMaxFilter(object): + def __init__(self, max_len, min_len, underlying_creator): + self._min_len = min_len + self._max_len = max_len + self._creator = underlying_creator + + def append(self, info): + if info.max_len > self._max_len or info.min_len < self._min_len: + return + else: + return 
self._creator.append(info) + + @property + def batch(self): + return self._creator.batch + + +class Seq2SeqDataset(Dataset): + def __init__(self, + src_vocab_fpath, + trg_vocab_fpath, + fpattern, + tar_fname=None, + field_delimiter="\t", + token_delimiter=" ", + start_mark="", + end_mark="", + unk_mark="", + only_src=False): + # convert str to bytes, and use byte data + field_delimiter = field_delimiter.encode("utf8") + token_delimiter = token_delimiter.encode("utf8") + start_mark = start_mark.encode("utf8") + end_mark = end_mark.encode("utf8") + unk_mark = unk_mark.encode("utf8") + self._src_vocab = self.load_dict(src_vocab_fpath) + self._trg_vocab = self.load_dict(trg_vocab_fpath) + self._bos_idx = self._src_vocab[start_mark] + self._eos_idx = self._src_vocab[end_mark] + self._unk_idx = self._src_vocab[unk_mark] + self._only_src = only_src + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter + self.load_src_trg_ids(fpattern, tar_fname) + + def load_src_trg_ids(self, fpattern, tar_fname): + converters = [ + Converter(vocab=self._src_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=False) + ] + if not self._only_src: + converters.append( + Converter(vocab=self._trg_vocab, + beg=self._bos_idx, + end=self._eos_idx, + unk=self._unk_idx, + delimiter=self._token_delimiter, + add_beg=True)) + + converters = ComposedConverter(converters) + + self._src_seq_ids = [] + self._trg_seq_ids = None if self._only_src else [] + self._sample_infos = [] + + for i, line in enumerate(self._load_lines(fpattern, tar_fname)): + src_trg_ids = converters(line) + self._src_seq_ids.append(src_trg_ids[0]) + lens = [len(src_trg_ids[0])] + if not self._only_src: + self._trg_seq_ids.append(src_trg_ids[1]) + lens.append(len(src_trg_ids[1])) + self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) + + def _load_lines(self, fpattern, tar_fname): + fpaths = glob.glob(fpattern) + assert len(fpaths) > 0, "no matching file to the provided data path" + + if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): + if tar_fname is None: + raise Exception("If tar file provided, please set tar_fname.") + + f = tarfile.open(fpaths[0], "rb") + for line in f.extractfile(tar_fname): + fields = line.strip(b"\n").split(self._field_delimiter) + if (not self._only_src + and len(fields) == 2) or (self._only_src + and len(fields) == 1): + yield fields + else: + for fpath in fpaths: + if not os.path.isfile(fpath): + raise IOError("Invalid file: %s" % fpath) + + with open(fpath, "rb") as f: + for line in f: + fields = line.strip(b"\n").split(self._field_delimiter) + if (not self._only_src and len(fields) == 2) or ( + self._only_src and len(fields) == 1): + yield fields + + @staticmethod + def load_dict(dict_path, reverse=False): + word_dict = {} + with open(dict_path, "rb") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip(b"\n") + else: + word_dict[line.strip(b"\n")] = idx + return word_dict + + def get_vocab_summary(self): + return len(self._src_vocab), len( + self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx + + def __getitem__(self, idx): + return (self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1], + self._trg_seq_ids[idx][1:] + ) if not self._only_src else self._src_seq_ids[idx] + + def __len__(self): + return len(self._sample_infos) + + +class Seq2SeqBatchSampler(BatchSampler): + def __init__(self, + dataset, + batch_size, + pool_size=10000, + sort_type=SortType.NONE, + min_length=0, 
+ max_length=100, + shuffle=False, + shuffle_batch=False, + use_token_batch=False, + clip_last_batch=False, + seed=0): + for arg, value in locals().items(): + if arg != "self": + setattr(self, "_" + arg, value) + self._random = np.random + self._random.seed(seed) + # for multi-devices + self._nranks = ParallelEnv().nranks + self._local_rank = ParallelEnv().local_rank + self._device_id = ParallelEnv().dev_id + + def __iter__(self): + # global sort or global shuffle + if self._sort_type == SortType.GLOBAL: + infos = sorted(self._dataset._sample_infos, + key=lambda x: x.max_len) + else: + if self._shuffle: + infos = self._dataset._sample_infos + self._random.shuffle(infos) + else: + infos = self._dataset._sample_infos + + if self._sort_type == SortType.POOL: + reverse = True + for i in range(0, len(infos), self._pool_size): + # to avoid placing short next to long sentences + reverse = not reverse + infos[i:i + self._pool_size] = sorted( + infos[i:i + self._pool_size], + key=lambda x: x.max_len, + reverse=reverse) + + batches = [] + batch_creator = TokenBatchCreator( + self._batch_size + ) if self._use_token_batch else SentenceBatchCreator(self._batch_size * + self._nranks) + batch_creator = MinMaxFilter(self._max_length, self._min_length, + batch_creator) + + for info in infos: + batch = batch_creator.append(info) + if batch is not None: + batches.append(batch) + + if not self._clip_last_batch and len(batch_creator.batch) != 0: + batches.append(batch_creator.batch) + + if self._shuffle_batch: + self._random.shuffle(batches) + + if not self._use_token_batch: + # when producing batches according to sequence number, to confirm + # neighbor batches which would be feed and run parallel have similar + # length (thus similar computational cost) after shuffle, we as take + # them as a whole when shuffling and split here + batches = [[ + batch[self._batch_size * i:self._batch_size * (i + 1)] + for i in range(self._nranks) + ] for batch in batches] + batches = list(itertools.chain.from_iterable(batches)) + + # for multi-device + for batch_id, batch in enumerate(batches): + if batch_id % self._nranks == self._local_rank: + batch_indices = [info.i for info in batch] + yield batch_indices + if self._local_rank > len(batches) % self._nranks: + yield batch_indices + + def __len__(self): + return 100 diff --git a/transformer/run.sh b/transformer/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..a55d7a7ac747636c2c4fcdfbec1a0f5160a7be05 --- /dev/null +++ b/transformer/run.sh @@ -0,0 +1,41 @@ +python -u train.py \ + --epoch 30 \ + --src_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --training_file wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de.tiny \ + --validation_file wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 4096 \ + --print_step 1 \ + --use_cuda True \ + --random_seed 1000 \ + --save_step 10 \ + --eager_run True + #--init_from_pretrain_model base_model_dygraph/step_100000/ \ + #--init_from_checkpoint trained_models/step_200/transformer + #--n_head 16 \ + #--d_model 1024 \ + #--d_inner_hid 4096 \ + #--prepostprocess_dropout 0.3 +exit + +echo `date` + +python -u predict.py \ + --src_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --trg_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --predict_file wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \ + --batch_size 64 \ + --init_from_params 
base_model_dygraph/step_100000/ \ + --beam_size 5 \ + --max_out_len 255 \ + --output_file predict.txt \ + --eager_run True + #--max_length 500 \ + #--n_head 16 \ + #--d_model 1024 \ + #--d_inner_hid 4096 \ + #--prepostprocess_dropout 0.3 + +echo `date` \ No newline at end of file diff --git a/transformer/train.py b/transformer/train.py new file mode 100644 index 0000000000000000000000000000000000000000..557241843b0d039ddc3344d0f6eae6a855af83b9 --- /dev/null +++ b/transformer/train.py @@ -0,0 +1,199 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import six +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from functools import partial + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph import to_variable +from paddle.fluid.io import DataLoader + +from utils.configure import PDConfig +from utils.check import check_gpu, check_version + +from model import Input, set_device +from callbacks import ProgBarLogger +from reader import prepare_train_input, Seq2SeqDataset, Seq2SeqBatchSampler +from transformer import Transformer, CrossEntropyCriterion, NoamDecay + + +class LoggerCallback(ProgBarLogger): + def __init__(self, log_freq=1, verbose=2, loss_normalizer=0.): + super(LoggerCallback, self).__init__(log_freq, verbose) + # TODO: wrap these override function to simplify + self.loss_normalizer = loss_normalizer + + def on_train_begin(self, logs=None): + super(LoggerCallback, self).on_train_begin(logs) + self.train_metrics += ["normalized loss", "ppl"] + + def on_train_batch_end(self, step, logs=None): + logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer + logs["ppl"] = np.exp(min(logs["loss"][0], 100)) + super(LoggerCallback, self).on_train_batch_end(step, logs) + + def on_eval_begin(self, logs=None): + super(LoggerCallback, self).on_eval_begin(logs) + self.eval_metrics += ["normalized loss", "ppl"] + + def on_eval_batch_end(self, step, logs=None): + logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer + logs["ppl"] = np.exp(min(logs["loss"][0], 100)) + super(LoggerCallback, self).on_eval_batch_end(step, logs) + + +def do_train(args): + device = set_device("gpu" if args.use_cuda else "cpu") + fluid.enable_dygraph(device) if args.eager_run else None + + # set seed for CE + random_seed = eval(str(args.random_seed)) + if random_seed is not None: + fluid.default_main_program().random_seed = random_seed + fluid.default_startup_program().random_seed = random_seed + + # define inputs + inputs = [ + Input( + [None, None], "int64", name="src_word"), + Input( + [None, None], "int64", name="src_pos"), + Input( + [None, args.n_head, None, None], + "float32", + name="src_slf_attn_bias"), + Input( + [None, None], "int64", name="trg_word"), + Input( + [None, None], "int64", name="trg_pos"), + Input( + [None, args.n_head, None, None], + "float32", + name="trg_slf_attn_bias"), + Input( + [None, args.n_head, None, 
None], + "float32", + name="trg_src_attn_bias"), + ] + labels = [ + Input( + [None, 1], "int64", name="label"), + Input( + [None, 1], "float32", name="weight"), + ] + + # def dataloader + data_loaders = [None, None] + data_files = [args.training_file, args.validation_file + ] if args.validation_file else [args.training_file] + for i, data_file in enumerate(data_files): + dataset = Seq2SeqDataset( + fpattern=data_file, + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + token_delimiter=args.token_delimiter, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2]) + args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ + args.unk_idx = dataset.get_vocab_summary() + batch_sampler = Seq2SeqBatchSampler( + dataset=dataset, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size, + pool_size=args.pool_size, + sort_type=args.sort_type, + shuffle=args.shuffle, + shuffle_batch=args.shuffle_batch, + max_length=args.max_length) + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + feed_list=None if fluid.in_dygraph_mode() else + [x.forward() for x in inputs + labels], + collate_fn=partial( + prepare_train_input, + src_pad_idx=args.eos_idx, + trg_pad_idx=args.eos_idx, + n_head=args.n_head), + num_workers=0, # TODO: use multi-process + return_list=True) + data_loaders[i] = data_loader + train_loader, eval_loader = data_loaders + + # define model + transformer = Transformer( + args.src_vocab_size, args.trg_vocab_size, args.max_length + 1, + args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model, + args.d_inner_hid, args.prepostprocess_dropout, args.attention_dropout, + args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, + args.weight_sharing, args.bos_idx, args.eos_idx) + + transformer.prepare( + fluid.optimizer.Adam( + learning_rate=fluid.layers.noam_decay(args.d_model, + args.warmup_steps), + beta1=args.beta1, + beta2=args.beta2, + epsilon=float(args.eps), + parameter_list=transformer.parameters()), + CrossEntropyCriterion(args.label_smooth_eps), + inputs=inputs, + labels=labels) + + ## init from some checkpoint, to resume the previous training + if args.init_from_checkpoint: + transformer.load( + os.path.join(args.init_from_checkpoint, "transformer")) + ## init from some pretrain models, to better solve the current task + if args.init_from_pretrain_model: + transformer.load( + os.path.join(args.init_from_pretrain_model, "transformer"), + reset_optimizer=True) + + # the best cross-entropy value with label smoothing + loss_normalizer = -( + (1. - args.label_smooth_eps) * np.log( + (1. - args.label_smooth_eps)) + args.label_smooth_eps * + np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) + + # model train + transformer.fit(train_data=train_loader, + eval_data=eval_loader, + epochs=1, + eval_freq=1, + save_freq=1, + verbose=2, + callbacks=[ + LoggerCallback( + log_freq=args.print_step, + loss_normalizer=loss_normalizer) + ]) + + +if __name__ == "__main__": + args = PDConfig(yaml_file="./transformer.yaml") + args.build() + args.Print() + check_gpu(args.use_cuda) + check_version() + + do_train(args) diff --git a/transformer/transformer.py b/transformer/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..fd64759c1c03b9f6cf6f38990b7a9f4522b3d409 --- /dev/null +++ b/transformer/transformer.py @@ -0,0 +1,691 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable +from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay +from model import Model, CrossEntropy, Loss +from text import TransformerBeamSearchDecoder, DynamicDecode + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + channels = d_pos_vec + position = np.arange(n_position) + num_timescales = channels // 2 + log_timescale_increment = (np.log(float(1e4) / float(1)) / + (num_timescales - 1)) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) + signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') + position_enc = signal + return position_enc.astype("float32") + + +class NoamDecay(LearningRateDecay): + """ + learning rate scheduler + """ + + def __init__(self, + d_model, + warmup_steps, + static_lr=2.0, + begin=1, + step=1, + dtype='float32'): + super(NoamDecay, self).__init__(begin, step, dtype) + self.d_model = d_model + self.warmup_steps = warmup_steps + self.static_lr = static_lr + + def step(self): + a = self.create_lr_var(self.step_num**-0.5) + b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num) + lr_value = (self.d_model**-0.5) * layers.elementwise_min( + a, b) * self.static_lr + return lr_value + + +class PrePostProcessLayer(Layer): + """ + PrePostProcessLayer + """ + + def __init__(self, process_cmd, d_model, dropout_rate): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append(lambda x, y: x + y if y else x) + elif cmd == "n": # add layer normalization + self.functors.append( + self.add_sublayer( + "layer_norm_%d" % len( + self.sublayers(include_sublayers=False)), + LayerNorm( + normalized_shape=d_model, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))))) + elif cmd == "d": # add dropout + self.functors.append(lambda x: layers.dropout( + x, dropout_prob=dropout_rate, is_test=False) + if dropout_rate else x) + + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) + else: + x = self.functors[i](x) + return x + + +class MultiHeadAttention(Layer): + """ + Multi-Head Attention + """ + + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = 
d_model + self.dropout_rate = dropout_rate + self.q_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.k_fc = Linear( + input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) + self.v_fc = Linear( + input_dim=d_model, output_dim=d_value * n_head, bias_attr=False) + self.proj_fc = Linear( + input_dim=d_value * n_head, output_dim=d_model, bias_attr=False) + + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True + + q = self.q_fc(queries) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] = k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + q, k, v = self._prepare_qkv(queries, keys, values, cache) + + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5) + if attn_bias: + product += attn_bias + weights = layers.softmax(product) + if self.dropout_rate: + weights = layers.dropout( + weights, dropout_prob=self.dropout_rate, is_test=False) + + out = layers.matmul(weights, v) + + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.proj_fc(out) + return out + + def cal_kv(self, keys, values): + k = self.k_fc(keys) + v = self.v_fc(values) + k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + +class FFN(Layer): + """ + Feed-Forward Network + """ + + def __init__(self, d_inner_hid, d_model, dropout_rate): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + self.fc1 = Linear( + input_dim=d_model, output_dim=d_inner_hid, act="relu") + self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model) + + def forward(self, x): + hidden = self.fc1(x) + if self.dropout_rate: + hidden = layers.dropout( + hidden, dropout_prob=self.dropout_rate, is_test=False) + out = self.fc2(hidden) + return out + + +class EncoderLayer(Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = 
MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class Encoder(Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(Encoder, self).__init__() + + self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, + postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + + return self.processer(enc_output) + + +class Embedder(Layer): + """ + Word Embedding + Position Encoding + """ + + def __init__(self, vocab_size, emb_dim, bos_idx=0): + super(Embedder, self).__init__() + + self.word_embedder = Embedding( + size=[vocab_size, emb_dim], + padding_idx=bos_idx, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Normal(0., emb_dim**-0.5))) + + def forward(self, word): + word_emb = self.word_embedder(word) + return word_emb + + +class WrapEncoder(Layer): + """ + embedder + encoder + """ + + def __init__(self, src_vocab_size, max_length, n_layer, n_head, d_key, + d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd, word_embedder): + super(WrapEncoder, self).__init__() + + self.emb_dropout = prepostprocess_dropout + self.emb_dim = d_model + self.word_embedder = word_embedder + self.pos_encoder = Embedding( + size=[max_length, self.emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + position_encoding_init(max_length, self.emb_dim)), + trainable=False)) + + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, src_word, src_pos, src_slf_attn_bias): + word_emb = self.word_embedder(src_word) + word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5) + pos_enc = self.pos_encoder(src_pos) + pos_enc.stop_gradient = True + emb = word_emb + pos_enc + enc_input = layers.dropout( + emb, dropout_prob=self.emb_dropout, + is_test=False) if self.emb_dropout else emb + + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class DecoderLayer(Layer): + """ + decoder + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + 
postprocess_cmd="da"): + super(DecoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + cache=None): + self_attn_output = self.self_attn( + self.preprocesser1(dec_input), None, None, self_attn_bias, cache) + self_attn_output = self.postprocesser1(self_attn_output, dec_input) + + cross_attn_output = self.cross_attn( + self.preprocesser2(self_attn_output), enc_output, enc_output, + cross_attn_bias, cache) + cross_attn_output = self.postprocesser2(cross_attn_output, + self_attn_output) + + ffn_output = self.ffn(self.preprocesser3(cross_attn_output)) + ffn_output = self.postprocesser3(ffn_output, cross_attn_output) + + return ffn_output + + +class Decoder(Layer): + """ + decoder + """ + + def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, relu_dropout, + preprocess_cmd, postprocess_cmd): + super(Decoder, self).__init__() + + self.decoder_layers = list() + for i in range(n_layer): + self.decoder_layers.append( + self.add_sublayer( + "layer_%d" % i, + DecoderLayer(n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, + postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, + dec_input, + enc_output, + self_attn_bias, + cross_attn_bias, + caches=None): + for i, decoder_layer in enumerate(self.decoder_layers): + dec_output = decoder_layer(dec_input, enc_output, self_attn_bias, + cross_attn_bias, None + if caches is None else caches[i]) + dec_input = dec_output + + return self.processer(dec_output) + + def prepare_static_cache(self, enc_output): + return [ + dict( + zip(("static_k", "static_v"), + decoder_layer.cross_attn.cal_kv(enc_output, enc_output))) + for decoder_layer in self.decoder_layers + ] + + +class WrapDecoder(Layer): + """ + embedder + decoder + """ + + def __init__(self, trg_vocab_size, max_length, n_layer, n_head, d_key, + d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd, share_input_output_embed, word_embedder): + super(WrapDecoder, self).__init__() + + self.emb_dropout = prepostprocess_dropout + self.emb_dim = d_model + self.word_embedder = word_embedder + self.pos_encoder = Embedding( + size=[max_length, self.emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + position_encoding_init(max_length, self.emb_dim)), + trainable=False)) + + self.decoder = Decoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + 
postprocess_cmd) + + if share_input_output_embed: + self.linear = lambda x: layers.matmul(x=x, + y=self.word_embedder. + word_embedder.weight, + transpose_y=True) + else: + self.linear = Linear( + input_dim=d_model, output_dim=trg_vocab_size, bias_attr=False) + + def forward(self, + trg_word, + trg_pos, + trg_slf_attn_bias, + trg_src_attn_bias, + enc_output, + caches=None): + word_emb = self.word_embedder(trg_word) + word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5) + pos_enc = self.pos_encoder(trg_pos) + pos_enc.stop_gradient = True + emb = word_emb + pos_enc + dec_input = layers.dropout( + emb, dropout_prob=self.emb_dropout, + is_test=False) if self.emb_dropout else emb + dec_output = self.decoder(dec_input, enc_output, trg_slf_attn_bias, + trg_src_attn_bias, caches) + dec_output = layers.reshape( + dec_output, + shape=[-1, dec_output.shape[-1]], ) + logits = self.linear(dec_output) + return logits + + +class CrossEntropyCriterion(Loss): + def __init__(self, label_smooth_eps): + super(CrossEntropyCriterion, self).__init__() + self.label_smooth_eps = label_smooth_eps + + def forward(self, outputs, labels): + predict, (label, weights) = outputs[0], labels + if self.label_smooth_eps: + label = layers.label_smooth( + label=layers.one_hot( + input=label, depth=predict.shape[-1]), + epsilon=self.label_smooth_eps) + + cost = layers.softmax_with_cross_entropy( + logits=predict, + label=label, + soft_label=True if self.label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + token_num.stop_gradient = True + avg_cost = sum_cost / token_num + return avg_cost + + +class Transformer(Model): + """ + model + """ + + def __init__(self, + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_id=0, + eos_id=1): + super(Transformer, self).__init__() + src_word_embedder = Embedder( + vocab_size=src_vocab_size, emb_dim=d_model, bos_idx=bos_id) + self.encoder = WrapEncoder( + src_vocab_size, max_length, n_layer, n_head, d_key, d_value, + d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd, src_word_embedder) + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." 
+ ) + trg_word_embedder = src_word_embedder + else: + trg_word_embedder = Embedder( + vocab_size=trg_vocab_size, emb_dim=d_model, bos_idx=bos_id) + self.decoder = WrapDecoder( + trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, + d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, + trg_word_embedder) + + self.trg_vocab_size = trg_vocab_size + self.n_layer = n_layer + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + + def forward(self, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias): + enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias) + predict = self.decoder(trg_word, trg_pos, trg_slf_attn_bias, + trg_src_attn_bias, enc_output) + return predict + + +class TransfomerCell(object): + """ + Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be + used as RNNCell + """ + + def __init__(self, decoder): + self.decoder = decoder + + def __call__(self, inputs, states, trg_src_attn_bias, enc_output, + static_caches): + trg_word, trg_pos = inputs + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias, + enc_output, states) + new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states] + return logits, new_states + + +class InferTransformer(Transformer): + """ + model for prediction + """ + + def __init__(self, + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_id=0, + eos_id=1, + beam_size=4, + max_out_len=256): + args = dict(locals()) + args.pop("self") + args.pop("__class__", None) # py3 + self.beam_size = args.pop("beam_size") + self.max_out_len = args.pop("max_out_len") + super(InferTransformer, self).__init__(**args) + cell = TransfomerCell(self.decoder) + self.beam_search_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + cell, bos_id, eos_id, beam_size, var_dim_in_state=2), + max_out_len, + is_test=True) + + def forward(self, src_word, src_pos, src_slf_attn_bias, trg_src_attn_bias): + enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias) + ## init states (caches) for transformer, need to be updated according to selected beam + caches = [{ + "k": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_key], + dtype=enc_output.dtype, + value=0), + "v": layers.fill_constant_batch_size_like( + input=enc_output, + shape=[-1, self.n_head, 0, self.d_value], + dtype=enc_output.dtype, + value=0), + } for i in range(self.n_layer)] + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, self.beam_size) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, self.beam_size) + static_caches = self.decoder.decoder.prepare_static_cache(enc_output) + rs, _ = self.beam_search_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + return rs diff --git a/transformer/transformer.yaml b/transformer/transformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58bc72ba3d4ed6e94969778e34e4e1cea812ed57 --- /dev/null +++ b/transformer/transformer.yaml @@ -0,0 +1,112 @@ +# used for continuous evaluation +enable_ce: False + 
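+# Whether to run in dygraph (eager) mode; static-graph mode is used otherwise.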
+eager_run: False + +# The frequency to save trained models when training. +save_step: 10000 +# The frequency to fetch and print output when training. +print_step: 100 +# path of the checkpoint, to resume the previous training +init_from_checkpoint: "" +# path of the pretrain model, to better solve the current task +init_from_pretrain_model: "" +# path of trained parameter, to make prediction +init_from_params: "trained_params/step_100000/" +# the directory for saving model +save_model: "trained_models" +# the directory for saving inference model. +inference_model_dir: "infer_model" +# Set seed for CE or debug +random_seed: None +# The pattern to match training data files. +training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de" +# The pattern to match validation data files. +validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de" +# The pattern to match test data files. +predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de" +# The file to output the translation results of predict_file to. +output_file: "predict.txt" +# The path of vocabulary file of source language. +src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" +# The path of vocabulary file of target language. +trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000" +# The , and tokens in the dictionary. +special_token: ["", "", ""] + +# whether to use cuda +use_cuda: True + +# args for reader, see reader.py for details +token_delimiter: " " +use_token_batch: True +pool_size: 200000 +sort_type: "pool" +shuffle: True +shuffle_batch: True +batch_size: 4096 + +# Hyparams for training: +# the number of epoches for training +epoch: 30 +# the hyper parameters for Adam optimizer. +# This static learning_rate will be multiplied to the LearningRateScheduler +# derived learning rate the to get the final learning rate. +learning_rate: 2.0 +beta1: 0.9 +beta2: 0.997 +eps: 1e-9 +# the parameters for learning rate scheduling. +warmup_steps: 8000 +# the weight used to mix up the ground-truth distribution and the fixed +# uniform distribution in label smoothing when training. +# Set this as zero if label smoothing is not wanted. +label_smooth_eps: 0.1 + +# Hyparams for generation: +# the parameters for beam search. +beam_size: 5 +max_out_len: 256 +# the number of decoded sentences to output. +n_best: 1 + +# Hyparams for model: +# These following five vocabularies related configurations will be set +# automatically according to the passed vocabulary path and special tokens. +# size of source word dictionary. +src_vocab_size: 10000 +# size of target word dictionay +trg_vocab_size: 10000 +# index for token +bos_idx: 0 +# index for token +eos_idx: 1 +# index for token +unk_idx: 2 +# max length of sequences deciding the size of position encoding table. +max_length: 256 +# the dimension for word embeddings, which is also the last dimension of +# the input and output of multi-head attention, position-wise feed-forward +# networks, encoder and decoder. +d_model: 512 +# size of the hidden layer in position-wise feed-forward networks. +d_inner_hid: 2048 +# the dimension that keys are projected to for dot-product attention. +d_key: 64 +# the dimension that values are projected to for dot-product attention. +d_value: 64 +# number of head used in multi-head attention. +n_head: 8 +# number of sub-layers to be stacked in the encoder and decoder. +n_layer: 6 +# dropout rates of different modules. 
+prepostprocess_dropout: 0.1 +attention_dropout: 0.1 +relu_dropout: 0.1 +# to process before each sub-layer +preprocess_cmd: "n" # layer normalization +# to process after each sub-layer +postprocess_cmd: "da" # dropout + residual connection +# the flag indicating whether to share embedding and softmax weights. +# vocabularies in source and target should be same for weight sharing. +weight_sharing: True diff --git a/transformer/utils/__init__.py b/transformer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/transformer/utils/check.py b/transformer/utils/check.py new file mode 100644 index 0000000000000000000000000000000000000000..305fa3705f5c313569986cbdb15c8afeda5a79c1 --- /dev/null +++ b/transformer/utils/check.py @@ -0,0 +1,61 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle.fluid as fluid + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['check_gpu', 'check_version'] + + +def check_gpu(use_gpu): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config use_gpu cannot be set as true while you are " \ + "using paddlepaddle cpu version ! \nPlease try: \n" \ + "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ + "\t2. Set use_gpu as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and not fluid.is_compiled_with_cuda(): + logger.error(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + logger.error(err) + sys.exit(1) diff --git a/transformer/utils/configure.py b/transformer/utils/configure.py new file mode 100644 index 0000000000000000000000000000000000000000..67e601282fee572518435eaed38a4ed8e26fc5f9 --- /dev/null +++ b/transformer/utils/configure.py @@ -0,0 +1,350 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
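+
+# PDConfig (defined below) merges three configuration sources into a single
+# namespace: a yaml/json file, a set of predefined defaults and command-line
+# flags. File entries are registered as argparse defaults, so values passed
+# on the command line override the ones in the configuration file; attribute
+# lookup falls back from the parsed arguments to the raw json/yaml contents.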
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import argparse +import json +import yaml +import six +import logging + +logging_only_message = "%(message)s" +logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s" + + +class JsonConfig(object): + """ + A high-level api for handling json configure file. + """ + + def __init__(self, config_path): + self._config_dict = self._parse(config_path) + + def _parse(self, config_path): + try: + with open(config_path) as json_file: + config_dict = json.load(json_file) + except: + raise IOError("Error in parsing bert model config file '%s'" % + config_path) + else: + return config_dict + + def __getitem__(self, key): + return self._config_dict[key] + + def print_config(self): + for arg, value in sorted(six.iteritems(self._config_dict)): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +class ArgumentGroup(object): + def __init__(self, parser, title, des): + self._group = parser.add_argument_group(title=title, description=des) + + def add_arg(self, name, type, default, help, **kwargs): + type = str2bool if type == bool else type + self._group.add_argument( + "--" + name, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +class ArgConfig(object): + """ + A high-level api for handling argument configs. + """ + + def __init__(self): + parser = argparse.ArgumentParser() + + train_g = ArgumentGroup(parser, "training", "training options.") + train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") + train_g.add_arg("learning_rate", float, 5e-5, + "Learning rate used to train with warmup.") + train_g.add_arg( + "lr_scheduler", + str, + "linear_warmup_decay", + "scheduler of learning rate.", + choices=['linear_warmup_decay', 'noam_decay']) + train_g.add_arg("weight_decay", float, 0.01, + "Weight decay rate for L2 regularizer.") + train_g.add_arg( + "warmup_proportion", float, 0.1, + "Proportion of training steps to perform linear learning rate warmup for." + ) + train_g.add_arg("save_steps", int, 1000, + "The steps interval to save checkpoints.") + train_g.add_arg("use_fp16", bool, False, + "Whether to use fp16 mixed precision training.") + train_g.add_arg( + "loss_scaling", float, 1.0, + "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." 
+ ) + train_g.add_arg("pred_dir", str, None, + "Path to save the prediction results") + + log_g = ArgumentGroup(parser, "logging", "logging related.") + log_g.add_arg("skip_steps", int, 10, + "The steps interval to print loss.") + log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") + + run_type_g = ArgumentGroup(parser, "run_type", "running type options.") + run_type_g.add_arg("use_cuda", bool, True, + "If set, use GPU for training.") + run_type_g.add_arg( + "use_fast_executor", bool, False, + "If set, use fast parallel executor (in experiment).") + run_type_g.add_arg( + "num_iteration_per_drop_scope", int, 1, + "Ihe iteration intervals to clean up temporary variables.") + run_type_g.add_arg("do_train", bool, True, + "Whether to perform training.") + run_type_g.add_arg("do_predict", bool, True, + "Whether to perform prediction.") + + custom_g = ArgumentGroup(parser, "customize", "customized options.") + + self.custom_g = custom_g + + self.parser = parser + + def add_arg(self, name, dtype, default, descrip): + self.custom_g.add_arg(name, dtype, default, descrip) + + def build_conf(self): + return self.parser.parse_args() + + +def str2bool(v): + # because argparse does not support to parse "true, False" as python + # boolean directly + return v.lower() in ("true", "t", "1") + + +def print_arguments(args, log=None): + if not log: + print('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + else: + log.info('----------- Configuration Arguments -----------') + for arg, value in sorted(six.iteritems(vars(args))): + log.info('%s: %s' % (arg, value)) + log.info('------------------------------------------------') + + +class PDConfig(object): + """ + A high-level API for managing configuration files in PaddlePaddle. + Can jointly work with command-line-arugment, json files and yaml files. + """ + + def __init__(self, json_file="", yaml_file="", fuse_args=True): + """ + Init funciton for PDConfig. + json_file: the path to the json configure file. + yaml_file: the path to the yaml configure file. + fuse_args: if fuse the json/yaml configs with argparse. + """ + assert isinstance(json_file, str) + assert isinstance(yaml_file, str) + + if json_file != "" and yaml_file != "": + raise Warning( + "json_file and yaml_file can not co-exist for now. please only use one configure file type." + ) + return + + self.args = None + self.arg_config = {} + self.json_config = {} + self.yaml_config = {} + + parser = argparse.ArgumentParser() + + self.default_g = ArgumentGroup(parser, "default", "default options.") + self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.") + self.json_g = ArgumentGroup(parser, "json", "options from json.") + self.com_g = ArgumentGroup(parser, "custom", "customized options.") + + self.default_g.add_arg("do_train", bool, False, + "Whether to perform training.") + self.default_g.add_arg("do_predict", bool, False, + "Whether to perform predicting.") + self.default_g.add_arg("do_eval", bool, False, + "Whether to perform evaluating.") + self.default_g.add_arg("do_save_inference_model", bool, False, + "Whether to perform model saving for inference.") + + # NOTE: args for profiler + self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)") + self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. 
(used for benchmark)") + self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)") + + self.parser = parser + + if json_file != "": + self.load_json(json_file, fuse_args=fuse_args) + + if yaml_file: + self.load_yaml(yaml_file, fuse_args=fuse_args) + + def load_json(self, file_path, fuse_args=True): + + if not os.path.exists(file_path): + raise Warning("the json file %s does not exist." % file_path) + return + + with open(file_path, "r") as fin: + self.json_config = json.loads(fin.read()) + fin.close() + + if fuse_args: + for name in self.json_config: + if isinstance(self.json_config[name], list): + self.json_g.add_arg( + name, + type(self.json_config[name][0]), + self.json_config[name], + "This is from %s" % file_path, + nargs=len(self.json_config[name])) + continue + if not isinstance(self.json_config[name], int) \ + and not isinstance(self.json_config[name], float) \ + and not isinstance(self.json_config[name], str) \ + and not isinstance(self.json_config[name], bool): + + continue + + self.json_g.add_arg(name, + type(self.json_config[name]), + self.json_config[name], + "This is from %s" % file_path) + + def load_yaml(self, file_path, fuse_args=True): + + if not os.path.exists(file_path): + raise Warning("the yaml file %s does not exist." % file_path) + return + + with open(file_path, "r") as fin: + self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) + fin.close() + + if fuse_args: + for name in self.yaml_config: + if isinstance(self.yaml_config[name], list): + self.yaml_g.add_arg( + name, + type(self.yaml_config[name][0]), + self.yaml_config[name], + "This is from %s" % file_path, + nargs=len(self.yaml_config[name])) + continue + + if not isinstance(self.yaml_config[name], int) \ + and not isinstance(self.yaml_config[name], float) \ + and not isinstance(self.yaml_config[name], str) \ + and not isinstance(self.yaml_config[name], bool): + + continue + + self.yaml_g.add_arg(name, + type(self.yaml_config[name]), + self.yaml_config[name], + "This is from %s" % file_path) + + def build(self): + self.args = self.parser.parse_args() + self.arg_config = vars(self.args) + + def __add__(self, new_arg): + assert isinstance(new_arg, list) or isinstance(new_arg, tuple) + assert len(new_arg) >= 3 + assert self.args is None + + name = new_arg[0] + dtype = new_arg[1] + dvalue = new_arg[2] + desc = new_arg[3] if len( + new_arg) == 4 else "Description is not provided." + + self.com_g.add_arg(name, dtype, dvalue, desc) + + return self + + def __getattr__(self, name): + if name in self.arg_config: + return self.arg_config[name] + + if name in self.json_config: + return self.json_config[name] + + if name in self.yaml_config: + return self.yaml_config[name] + + raise Warning("The argument %s is not defined." 
% name) + + def Print(self): + + print("-" * 70) + for name in self.arg_config: + print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name]))) + + for name in self.json_config: + if name not in self.arg_config: + print("%s:\t\t\t\t%s" % + (str(name), str(self.json_config[name]))) + + for name in self.yaml_config: + if name not in self.arg_config: + print("%s:\t\t\t\t%s" % + (str(name), str(self.yaml_config[name]))) + + print("-" * 70) + + +if __name__ == "__main__": + """ + pd_config = PDConfig(json_file = "./test/bert_config.json") + pd_config.build() + + print(pd_config.do_train) + print(pd_config.hidden_size) + + pd_config = PDConfig(yaml_file = "./test/bert_config.yaml") + pd_config.build() + + print(pd_config.do_train) + print(pd_config.hidden_size) + """ + + pd_config = PDConfig(yaml_file="./test/bert_config.yaml") + pd_config += ("my_age", int, 18, "I am forever 18.") + pd_config.build() + + print(pd_config.do_train) + print(pd_config.hidden_size) + print(pd_config.my_age) diff --git a/yolov3.py b/yolov3.py deleted file mode 100644 index 6c609f24dce60293ee42a599324d595a6875a0f6..0000000000000000000000000000000000000000 --- a/yolov3.py +++ /dev/null @@ -1,568 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import print_function - -import argparse -import contextlib -import os -import random -import time - -from functools import partial - -import cv2 -import numpy as np -from pycocotools.coco import COCO - -import paddle -import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Conv2D -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.regularizer import L2Decay - -from model import Model, Loss, Input -from resnet import ResNet, ConvBNLayer - -import logging -FORMAT = '%(asctime)s-%(levelname)s: %(message)s' -logging.basicConfig(level=logging.INFO, format=FORMAT) -logger = logging.getLogger(__name__) - - -# XXX transfer learning -class ResNetBackBone(ResNet): - def __init__(self, depth=50): - super(ResNetBackBone, self).__init__(depth=depth) - delattr(self, 'fc') - - def forward(self, inputs): - x = self.conv(inputs) - x = self.pool(x) - outputs = [] - for layer in self.layers: - x = layer(x) - outputs.append(x) - return outputs - - -class YoloDetectionBlock(fluid.dygraph.Layer): - def __init__(self, num_channels, num_filters): - super(YoloDetectionBlock, self).__init__() - - assert num_filters % 2 == 0, \ - "num_filters {} cannot be divided by 2".format(num_filters) - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - act='leaky_relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * 2, - filter_size=3, - act='leaky_relu') - self.conv2 = ConvBNLayer( - num_channels=num_filters * 2, - num_filters=num_filters, - filter_size=1, - act='leaky_relu') - self.conv3 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * 2, - filter_size=3, - act='leaky_relu') - self.route = ConvBNLayer( - num_channels=num_filters * 2, - num_filters=num_filters, - filter_size=1, - act='leaky_relu') - self.tip = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * 2, - filter_size=3, - act='leaky_relu') - - def forward(self, inputs): - out = self.conv0(inputs) - out = self.conv1(out) - out = self.conv2(out) - out = self.conv3(out) - route = self.route(out) - tip = self.tip(route) - return route, tip - - -class YOLOv3(Model): - def __init__(self, num_classes=80): - super(YOLOv3, self).__init__() - self.num_classes = num_classes - self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, - 59, 119, 116, 90, 156, 198, 373, 326] - self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] - self.valid_thresh = 0.005 - self.nms_thresh = 0.45 - self.nms_topk = 400 - self.nms_posk = 100 - self.draw_thresh = 0.5 - - self.backbone = ResNetBackBone() - self.block_outputs = [] - self.yolo_blocks = [] - self.route_blocks = [] - - for idx, num_chan in enumerate([2048, 1280, 640]): - yolo_block = self.add_sublayer( - "detecton_block_{}".format(idx), - YoloDetectionBlock(num_chan, num_filters=512 // (2**idx))) - self.yolo_blocks.append(yolo_block) - - num_filters = len(self.anchor_masks[idx]) * (self.num_classes + 5) - - block_out = self.add_sublayer( - "block_out_{}".format(idx), - Conv2D(num_channels=1024 // (2**idx), - num_filters=num_filters, - filter_size=1, - param_attr=ParamAttr( - initializer=fluid.initializer.Normal(0., 0.02)), - bias_attr=ParamAttr( - initializer=fluid.initializer.Constant(0.0), - regularizer=L2Decay(0.)))) - self.block_outputs.append(block_out) - if idx < 2: - route = self.add_sublayer( - "route_{}".format(idx), - ConvBNLayer(num_channels=512 // (2**idx), - num_filters=256 // (2**idx), - filter_size=1, - act='leaky_relu')) 
- self.route_blocks.append(route) - - def forward(self, inputs, img_info): - outputs = [] - boxes = [] - scores = [] - downsample = 32 - - feats = self.backbone(inputs) - feats = feats[::-1][:len(self.anchor_masks)] - route = None - for idx, feat in enumerate(feats): - if idx > 0: - feat = fluid.layers.concat(input=[route, feat], axis=1) - route, tip = self.yolo_blocks[idx](feat) - block_out = self.block_outputs[idx](tip) - outputs.append(block_out) - - if idx < 2: - route = self.route_blocks[idx](route) - route = fluid.layers.resize_nearest(route, scale=2) - - if self.mode == 'test': - anchor_mask = self.anchor_masks[idx] - mask_anchors = [] - for m in anchor_mask: - mask_anchors.append(self.anchors[2 * m]) - mask_anchors.append(self.anchors[2 * m + 1]) - img_shape = fluid.layers.slice(img_info, axes=[1], starts=[1], ends=[3]) - img_id = fluid.layers.slice(img_info, axes=[1], starts=[0], ends=[1]) - b, s = fluid.layers.yolo_box( - x=block_out, - img_size=img_shape, - anchors=mask_anchors, - class_num=self.num_classes, - conf_thresh=self.valid_thresh, - downsample_ratio=downsample) - - boxes.append(b) - scores.append(fluid.layers.transpose(s, perm=[0, 2, 1])) - - downsample //= 2 - - if self.mode != 'test': - return outputs - - return [img_id, fluid.layers.multiclass_nms( - bboxes=fluid.layers.concat(boxes, axis=1), - scores=fluid.layers.concat(scores, axis=2), - score_threshold=self.valid_thresh, - nms_top_k=self.nms_topk, - keep_top_k=self.nms_posk, - nms_threshold=self.nms_thresh, - background_label=-1)] - - -class YoloLoss(Loss): - def __init__(self, num_classes=80): - super(YoloLoss, self).__init__() - self.num_classes = num_classes - self.ignore_thresh = 0.7 - self.anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, - 59, 119, 116, 90, 156, 198, 373, 326] - self.anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] - - def forward(self, outputs, labels): - downsample = 32 - gt_box, gt_label, gt_score = labels - losses = [] - - for idx, out in enumerate(outputs): - anchor_mask = self.anchor_masks[idx] - loss = fluid.layers.yolov3_loss( - x=out, - gt_box=gt_box, - gt_label=gt_label, - gt_score=gt_score, - anchor_mask=anchor_mask, - downsample_ratio=downsample, - anchors=self.anchors, - class_num=self.num_classes, - ignore_thresh=self.ignore_thresh, - use_label_smooth=True) - loss = fluid.layers.reduce_mean(loss) - losses.append(loss) - downsample //= 2 - return losses - - -def make_optimizer(parameter_list=None): - base_lr = FLAGS.lr - warm_up_iter = 4000 - momentum = 0.9 - weight_decay = 5e-4 - boundaries = [400000, 450000] - values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)] - learning_rate = fluid.layers.piecewise_decay( - boundaries=boundaries, - values=values) - learning_rate = fluid.layers.linear_lr_warmup( - learning_rate=learning_rate, - warmup_steps=warm_up_iter, - start_lr=0.0, - end_lr=base_lr) - optimizer = fluid.optimizer.Momentum( - learning_rate=learning_rate, - regularization=fluid.regularizer.L2Decay(weight_decay), - momentum=momentum, - parameter_list=parameter_list) - return optimizer - - -def _iou_matrix(a, b): - tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - area_o = (area_a[:, np.newaxis] + area_b - area_i) - return area_i / (area_o + 1e-10) - - -def _crop_box_with_center_constraint(box, crop): - cropped_box = box.copy() - 
cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) - cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) - cropped_box[:, :2] -= crop[:2] - cropped_box[:, 2:] -= crop[:2] - centers = (box[:, :2] + box[:, 2:]) / 2 - valid = np.logical_and( - crop[:2] <= centers, centers < crop[2:]).all(axis=1) - valid = np.logical_and( - valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) - return cropped_box, np.where(valid)[0] - - -def random_crop(inputs): - aspect_ratios = [.5, 2.] - thresholds = [.0, .1, .3, .5, .7, .9] - scaling = [.3, 1.] - - img, img_ids, gt_box, gt_label = inputs - h, w = img.shape[:2] - - if len(gt_box) == 0: - return inputs - - np.random.shuffle(thresholds) - for thresh in thresholds: - found = False - for i in range(50): - scale = np.random.uniform(*scaling) - min_ar, max_ar = aspect_ratios - ar = np.random.uniform(max(min_ar, scale**2), - min(max_ar, scale**-2)) - crop_h = int(h * scale / np.sqrt(ar)) - crop_w = int(w * scale * np.sqrt(ar)) - crop_y = np.random.randint(0, h - crop_h) - crop_x = np.random.randint(0, w - crop_w) - crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] - iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32)) - if iou.max() < thresh: - continue - - cropped_box, valid_ids = _crop_box_with_center_constraint( - gt_box, np.array(crop_box, dtype=np.float32)) - if valid_ids.size > 0: - found = True - break - - if found: - x1, y1, x2, y2 = crop_box - img = img[y1:y2, x1:x2, :] - gt_box = np.take(cropped_box, valid_ids, axis=0) - gt_label = np.take(gt_label, valid_ids, axis=0) - return img, img_ids, gt_box, gt_label - - return inputs - - -# XXX mix up, color distort and random expand are skipped for simplicity -def sample_transform(inputs, mode='train', num_max_boxes=50): - if mode == 'train': - img, img_id, gt_box, gt_label = random_crop(inputs) - else: - img, img_id, gt_box, gt_label = inputs - - h, w = img.shape[:2] - # random flip - if mode == 'train' and np.random.uniform(0., 1.) > .5: - img = img[:, ::-1, :] - if len(gt_box) > 0: - swap = gt_box.copy() - gt_box[:, 0] = w - swap[:, 2] - 1 - gt_box[:, 2] = w - swap[:, 0] - 1 - - if len(gt_label) == 0: - gt_box = np.zeros([num_max_boxes, 4], dtype=np.float32) - gt_label = np.zeros([num_max_boxes], dtype=np.int32) - return img, gt_box, gt_label - - gt_box = gt_box[:num_max_boxes, :] - gt_label = gt_label[:num_max_boxes, 0] - # normalize boxes - gt_box /= np.array([w, h] * 2, dtype=np.float32) - gt_box[:, 2:] = gt_box[:, 2:] - gt_box[:, :2] - gt_box[:, :2] = gt_box[:, :2] + gt_box[:, 2:] / 2. - - pad = num_max_boxes - gt_label.size - gt_box = np.pad(gt_box, ((0, pad), (0, 0)), mode='constant') - gt_label = np.pad(gt_label, ((0, pad)), mode='constant') - - return img, img_id, gt_box, gt_label - - -def batch_transform(batch, mode='train'): - if mode == 'train': - d = np.random.choice( - [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]) - interp = np.random.choice(range(5)) - else: - d = 608 - interp = cv2.INTER_CUBIC - # transpose batch - imgs, img_ids, gt_boxes, gt_labels = list(zip(*batch)) - img_shapes = np.array([[im.shape[0], im.shape[1]] for im in imgs]).astype('int32') - imgs = np.array([cv2.resize( - img, (d, d), interpolation=interp) for img in imgs]) - - # transpose, permute and normalize - imgs = imgs.astype(np.float32)[..., ::-1] - mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) - std = np.array([58.395, 57.120, 57.375], dtype=np.float32) - invstd = 1. 
/ std - imgs -= mean - imgs *= invstd - imgs = imgs.transpose((0, 3, 1, 2)) - - img_ids = np.array(img_ids) - img_info = np.concatenate([img_ids, img_shapes], axis=1) - gt_boxes = np.array(gt_boxes) - gt_labels = np.array(gt_labels) - # XXX since mix up is not used, scores are all ones - gt_scores = np.ones_like(gt_labels, dtype=np.float32) - return [imgs, img_info], [gt_boxes, gt_labels, gt_scores] - - -def coco2017(root_dir, mode='train'): - json_path = os.path.join( - root_dir, 'annotations/instances_{}2017.json'.format(mode)) - coco = COCO(json_path) - img_ids = coco.getImgIds() - imgs = coco.loadImgs(img_ids) - class_map = {v: i + 1 for i, v in enumerate(coco.getCatIds())} - samples = [] - - for img in imgs: - img_path = os.path.join( - root_dir, '{}2017'.format(mode), img['file_name']) - file_path = img_path - width = img['width'] - height = img['height'] - ann_ids = coco.getAnnIds(imgIds=img['id'], iscrowd=False) - anns = coco.loadAnns(ann_ids) - - gt_box = [] - gt_label = [] - - for ann in anns: - x1, y1, w, h = ann['bbox'] - x2 = x1 + w - 1 - y2 = y1 + h - 1 - x1 = np.clip(x1, 0, width - 1) - x2 = np.clip(x2, 0, width - 1) - y1 = np.clip(y1, 0, height - 1) - y2 = np.clip(y2, 0, height - 1) - if ann['area'] <= 0 or x2 < x1 or y2 < y1: - continue - gt_label.append(ann['category_id']) - gt_box.append([x1, y1, x2, y2]) - - gt_box = np.array(gt_box, dtype=np.float32) - gt_label = np.array([class_map[cls] for cls in gt_label], - dtype=np.int32)[:, np.newaxis] - im_id = np.array([img['id']], dtype=np.int32) - - if gt_label.size == 0 and not mode == 'train': - continue - samples.append((file_path, im_id.copy(), gt_box.copy(), gt_label.copy())) - - def iterator(): - if mode == 'train': - np.random.shuffle(samples) - for file_path, im_id, gt_box, gt_label in samples: - img = cv2.imread(file_path) - yield img, im_id, gt_box, gt_label - - return iterator - - -# XXX coco metrics not included for simplicity -def run(model, loader, mode='train'): - total_loss = 0. - total_time = 0. 
- device_ids = list(range(FLAGS.num_devices)) - start = time.time() - - for idx, batch in enumerate(loader()): - losses = getattr(model, mode)(batch[0], batch[1]) - - total_loss += np.sum(losses) - if idx > 1: # skip first two steps - total_time += time.time() - start - if idx % 10 == 0: - logger.info("{:04d}: loss {:0.3f} time: {:0.3f}".format( - idx, total_loss / (idx + 1), total_time / max(1, (idx - 1)))) - start = time.time() - - -def main(): - @contextlib.contextmanager - def null_guard(): - yield - - epoch = FLAGS.epoch - batch_size = FLAGS.batch_size - guard = fluid.dygraph.guard() if FLAGS.dynamic else null_guard() - - train_loader = fluid.io.xmap_readers( - batch_transform, - paddle.batch( - fluid.io.xmap_readers( - sample_transform, - coco2017(FLAGS.data, 'train'), - process_num=8, - buffer_size=4 * batch_size), - batch_size=batch_size, - drop_last=True), - process_num=2, buffer_size=4) - - val_sample_transform = partial(sample_transform, mode='val') - val_batch_transform = partial(batch_transform, mode='val') - - val_loader = fluid.io.xmap_readers( - val_batch_transform, - paddle.batch( - fluid.io.xmap_readers( - val_sample_transform, - coco2017(FLAGS.data, 'val'), - process_num=8, - buffer_size=4 * batch_size), - batch_size=1), - process_num=2, buffer_size=4) - - if not os.path.exists('yolo_checkpoints'): - os.mkdir('yolo_checkpoints') - - with guard: - NUM_CLASSES = 7 - NUM_MAX_BOXES = 50 - model = YOLOv3(num_classes=NUM_CLASSES) - # XXX transfer learning - if FLAGS.pretrain_weights is not None: - model.backbone.load(FLAGS.pretrain_weights) - if FLAGS.weights is not None: - model.load(FLAGS.weights) - optim = make_optimizer(parameter_list=model.parameters()) - anno_path = os.path.join(FLAGS.data, 'annotations', 'instances_val2017.json') - inputs = [Input([None, 3, None, None], 'float32', name='image'), - Input([None, 3], 'int32', name='img_info')] - labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'), - Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'), - Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')] - model.prepare(optim, - YoloLoss(num_classes=NUM_CLASSES), - # For YOLOv3, output variable in train/eval is different, - # which is not supported by metric, add by callback later? 
- # metrics=COCOMetric(anno_path, with_background=False) - inputs=inputs, - labels = labels) - - for e in range(epoch): - logger.info("======== train epoch {} ========".format(e)) - run(model, train_loader) - model.save('yolo_checkpoints/{:02d}'.format(e)) - logger.info("======== eval epoch {} ========".format(e)) - run(model, val_loader, mode='eval') - # should be called in fit() - for metric in model._metrics: - metric.accumulate() - metric.reset() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser("Yolov3 Training on COCO") - parser.add_argument('data', metavar='DIR', help='path to COCO dataset') - parser.add_argument( - "-d", "--dynamic", action='store_true', help="enable dygraph mode") - parser.add_argument( - "-e", "--epoch", default=300, type=int, help="number of epoch") - parser.add_argument( - '--lr', '--learning-rate', default=0.001, type=float, metavar='LR', - help='initial learning rate') - parser.add_argument( - "-b", "--batch_size", default=64, type=int, help="batch size") - parser.add_argument( - "-n", "--num_devices", default=8, type=int, help="number of devices") - parser.add_argument( - "-p", "--pretrain_weights", default=None, type=str, - help="path to pretrained weights") - parser.add_argument( - "-w", "--weights", default=None, type=str, - help="path to model weights") - FLAGS = parser.parse_args() - assert FLAGS.data, "error: must provide data path" - main() diff --git a/yolov3/README.md b/yolov3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..88e153509c4e14c21c43dcc1d69fece7998616e1 --- /dev/null +++ b/yolov3/README.md @@ -0,0 +1,203 @@ +# YOLOv3 目标检测模型 + +--- + +## 内容 + +- [模型简介](#模型简介) +- [快速开始](#快速开始) +- [参考论文](#参考论文) + + +## 模型简介 + +[YOLOv3](https://arxiv.org/abs/1804.02767) 是由 [Joseph Redmon](https://arxiv.org/search/cs?searchtype=author&query=Redmon%2C+J) 和 [Ali Farhadi](https://arxiv.org/search/cs?searchtype=author&query=Farhadi%2C+A) 提出的单阶段检测器, 该检测器与达到同样精度的传统目标检测方法相比,推断速度能达到接近两倍. + +传统目标检测方法通过两阶段检测,第一阶段生成预选框,第二阶段对预选框进行分类和位置坐标的调整,而YOLO将目标检测看做是对框位置和类别概率的一个单阶段回归问题,使得YOLO能达到近两倍的检测速度。而YOLOv3在YOLO的基础上引入的多尺度预测,使得YOLOv3网络对于小物体的检测精度大幅提高。 + +[YOLOv3](https://arxiv.org/abs/1804.02767) 是一阶段End2End的目标检测器。其目标检测原理如下图所示: +

+<p align="center">
+<img src="image/YOLOv3.jpg"/><br>
+YOLOv3检测原理
+</p>

+ +YOLOv3将输入图像分成S\*S个格子,每个格子预测B个bounding box,每个bounding box预测内容包括: Location(x, y, w, h)、Confidence Score和C个类别的概率,因此YOLOv3输出层的channel数为B\*(5 + C)。YOLOv3的loss函数也有三部分组成:Location误差,Confidence误差和分类误差。 + +YOLOv3的网络结构如下图所示: +

+<p align="center">
+<img src="image/YOLOv3_structure.jpg"/><br>
+YOLOv3网络结构
+</p>
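+
+结合上文的预测方式,可以用一小段 Python 粗略推算各个预测尺度的特征图大小与输出通道数。下面仅是一个示意性的小例子:其中输入尺寸 416×416、每个尺度 3 个 anchor、80 个类别均为假设值(对应 COCO 的常见设置),`out_channels` 也只是示意用的辅助函数:
+
+```python
+# 每个 grid cell 预测 B 个 bounding box,
+# 每个 box 包含 4 个坐标、1 个 Confidence Score 和 C 个类别概率,
+# 因此每个尺度的输出通道数为 B * (5 + C)
+def out_channels(num_anchors, num_classes):   # 仅为示意的辅助函数
+    return num_anchors * (5 + num_classes)
+
+input_size = 416                     # 假设的输入尺寸
+for downsample in (32, 16, 8):       # 三个预测尺度对应的下采样倍数
+    grid = input_size // downsample
+    print(grid, out_channels(3, 80)) # 依次得到 13/26/52,通道数均为 255,即 3*(80+4+1)
+```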

+ +YOLOv3 的网络结构由基础特征提取网络、multi-scale特征融合层和输出层组成。 + +1. 特征提取网络。YOLOv3使用 [DarkNet53](https://arxiv.org/abs/1612.08242)作为特征提取网络:DarkNet53 基本采用了全卷积网络,用步长为2的卷积操作替代了池化层,同时添加了 Residual 单元,避免在网络层数过深时发生梯度弥散。 + +2. 特征融合层。为了解决之前YOLO版本对小目标不敏感的问题,YOLOv3采用了3个不同尺度的特征图来进行目标检测,分别为13\*13,26\*26,52\*52,用来检测大、中、小三种目标。特征融合层选取 DarkNet 产出的三种尺度特征图作为输入,借鉴了FPN(feature pyramid networks)的思想,通过一系列的卷积层和上采样对各尺度的特征图进行融合。 + +3. 输出层。同样使用了全卷积结构,其中最后一个卷积层的卷积核个数是255:3\*(80+4+1)=255,3表示一个grid cell包含3个bounding box,4表示框的4个坐标信息,1表示Confidence Score,80表示COCO数据集中80个类别的概率。 + + +## 快速开始 + +### 安装说明 + +#### paddle安装 + + 本项目依赖于 PaddlePaddle 1.7及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装 + +#### 代码下载及环境变量设置 + + 克隆代码库到本地,并设置`PYTHONPATH`环境变量 + + ```bash + git clone https://github.com/PaddlePaddle/hapi + cd hapi + export PYTHONPATH=$PYTHONPATH:`pwd` + cd tsm + ``` + +#### 安装COCO-API + + 训练前需要首先下载[COCO-API](https://github.com/cocodataset/cocoapi): + + ```bash + git clone https://github.com/cocodataset/cocoapi.git + cd cocoapi/PythonAPI + # if cython is not installed + pip install Cython + # Install into global site-packages + make install + # Alternatively, if you do not have permissions or prefer + # not to install the COCO API into global site-packages + python setup.py install --user + ``` + +### 数据准备 + +模型目前支持COCO数据集格式的数据读入和精度评估,我们同时提供了将转换为COCO数据集的格式的Pascal VOC数据集下载,可通过如下命令下载。 + + ```bash + python dataset/download_voc.py + ``` + +数据目录结构如下: + + ``` + dataset/voc/ + ├── annotations + │   ├── instances_train2017.json + │   ├── instances_val2017.json + | ... + ├── train2017 + │   ├── 1013.jpg + │   ├── 1014.jpg + | ... + ├── val2017 + │   ├── 2551.jpg + │   ├── 2552.jpg + | ... + ``` + +### 模型训练 + +数据准备完毕后,可使用`main.py`脚本启动训练和评估,如下脚本会自动每epoch交替进行训练和模型评估,并将checkpoint默认保存在`yolo_checkpoint`目录下。 + +YOLOv3模型训练总batch_size为64训练,以下以使用4卡Tesla P40每卡batch_size为16训练介绍训练方式。对于静态图和动态图,多卡训练中`--batch_size`为每卡上的batch_size,即总batch_size为`--batch_size`乘以卡数。 + + +`main.py`脚本参数可通过如下命令查询 + +```bash +python main.py --help +``` + +#### 静态图训练 + +使用如下方式进行多卡训练: + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --data= --batch_size=16 +``` + +#### 动态图训练 + +动态图训练只需要在运行脚本时添加`-d`参数即可。 + +使用如下方式进行多卡训练: + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --data= --batch_size=16 -d +``` + + +### 模型评估 + +YOLOv3模型输出为LoDTensor,只支持使用batch_size为1进行评估,可通过如下两种方式进行模型评估。 + +1. 自动下载Paddle发布的[YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams)权重评估 + +```bash +python main.py --data=dataset/voc --eval_only +``` + +2. 
加载checkpoint进行精度评估 + +```bash +python main.py --data=dataset/voc --eval_only --weights=yolo_checkpoint/no_mixup/final +``` + +同样可以通过指定`-d`参数进行动态图模式的评估。 + +#### 评估精度 + +在10类小数据集下训练模型权重见[YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams),评估精度如下: + +```bash +Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.503 +Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.779 +Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.562 +Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.190 +Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.390 +Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.578 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.405 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.591 +Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.599 +Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.294 +Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.506 +Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.670 +``` + +### 模型推断及可视化 + +可通过如下两种方式进行模型推断。 + +1. 自动下载Paddle发布的[YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams)权重评估 + +```bash +python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg +``` + +2. 加载checkpoint进行精度评估 + +```bash +python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg --weights=yolo_checkpoint/mo_mixup/final +``` + +推断结果可视化图像会保存于`--output`指定的文件夹下,默认保存于`./output`目录。 + +模型推断会输出如下检测结果日志: + +```text +2020-04-02 08:26:47,268-INFO: detect bicycle at [116.14993, 127.278336, 579.7716, 438.44214] score: 0.97 +2020-04-02 08:26:47,273-INFO: detect dog at [127.44086, 215.71997, 316.04276, 539.7584] score: 0.99 +2020-04-02 08:26:47,274-INFO: detect car at [475.42343, 80.007484, 687.16095, 171.27374] score: 0.98 +2020-04-02 08:26:47,274-INFO: Detection bbox results save in output/dog.jpg +``` + +## 参考论文 + +- [You Only Look Once: Unified, Real-Time Object Detection](https://arxiv.org/abs/1506.02640v5), Joseph Redmon, Santosh Divvala, Ross Girshick, Ali Farhadi. +- [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767v1), Joseph Redmon, Ali Farhadi. +- [Bag of Freebies for Training Object Detection Neural Networks](https://arxiv.org/abs/1902.04103v3), Zhi Zhang, Tong He, Hang Zhang, Zhongyue Zhang, Junyuan Xie, Mu Li. + diff --git a/yolov3/__init__.py b/yolov3/__init__.py deleted file mode 100644 index 9118340d83fefa17d4a7e8fc577ee22a2d3a2656..0000000000000000000000000000000000000000 --- a/yolov3/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- diff --git a/yolov3/coco.py b/yolov3/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..34809246c1f90d3ad029842c19ae5f2c3eba08b0 --- /dev/null +++ b/yolov3/coco.py @@ -0,0 +1,275 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import os +import cv2 +import numpy as np +from pycocotools.coco import COCO + +from paddle.fluid.io import Dataset + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['COCODataset'] + + +class COCODataset(Dataset): + """ + Load dataset with MS-COCO format. + + Args: + dataset_dir (str): root directory for dataset. + image_dir (str): directory for images. + anno_path (str): voc annotation file path. + sample_num (int): number of samples to load, -1 means all. + use_default_label (bool): whether use the default mapping of + label to integer index. Default True. + with_background (bool): whether load background as a class, + default True. + transform (callable): callable transform to perform on samples, + default None. + mixup (bool): whether return image mixup samples, default False. + alpha (float): alpha factor of beta distribution to generate + mixup score, used only when mixup is True, default 1.5 + beta (float): beta factor of beta distribution to generate + mixup score, used only when mixup is True, default 1.5 + """ + + def __init__(self, + dataset_dir='', + image_dir='', + anno_path='', + sample_num=-1, + with_background=True, + transform=None, + mixup=False, + alpha=1.5, + beta=1.5): + # roidbs is list of dict whose structure is: + # { + # 'im_file': im_fname, # image file name + # 'im_id': im_id, # image id + # 'h': im_h, # height of image + # 'w': im_w, # width + # 'is_crowd': is_crowd, + # 'gt_class': gt_class, + # 'gt_bbox': gt_bbox, + # 'gt_score': gt_score, + # 'difficult': difficult + # } + + self._anno_path = os.path.join(dataset_dir, anno_path) + self._image_dir = os.path.join(dataset_dir, image_dir) + assert os.path.exists(self._anno_path), \ + "anno_path {} not exists".format(anno_path) + assert os.path.exists(self._image_dir), \ + "image_dir {} not exists".format(image_dir) + + self._sample_num = sample_num + self._with_background = with_background + self._transform = transform + self._mixup = mixup + self._alpha = alpha + self._beta = beta + + # load in dataset roidbs + self._load_roidb_and_cname2cid() + + def _load_roidb_and_cname2cid(self): + assert self._anno_path.endswith('.json'), \ + 'invalid coco annotation file: ' + anno_path + coco = COCO(self._anno_path) + img_ids = coco.getImgIds() + cat_ids = coco.getCatIds() + records = [] + ct = 0 + + # when with_background = True, mapping category to classid, like: + # background:0, first_class:1, second_class:2, ... 
+ catid2clsid = dict({ + catid: i + int(self._with_background) + for i, catid in enumerate(cat_ids) + }) + cname2cid = dict({ + coco.loadCats(catid)[0]['name']: clsid + for catid, clsid in catid2clsid.items() + }) + + for img_id in img_ids: + img_anno = coco.loadImgs(img_id)[0] + im_fname = img_anno['file_name'] + im_w = float(img_anno['width']) + im_h = float(img_anno['height']) + + ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) + instances = coco.loadAnns(ins_anno_ids) + + bboxes = [] + for inst in instances: + x, y, box_w, box_h = inst['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(im_w - 1, x1 + max(0, box_w - 1)) + y2 = min(im_h - 1, y1 + max(0, box_h - 1)) + if inst['area'] > 0 and x2 >= x1 and y2 >= y1: + inst['clean_bbox'] = [x1, y1, x2, y2] + bboxes.append(inst) + else: + logger.warn( + 'Found an invalid bbox in annotations: im_id: {}, ' + 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( + img_id, float(inst['area']), x1, y1, x2, y2)) + num_bbox = len(bboxes) + + gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) + gt_class = np.zeros((num_bbox, 1), dtype=np.int32) + gt_score = np.ones((num_bbox, 1), dtype=np.float32) + is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) + difficult = np.zeros((num_bbox, 1), dtype=np.int32) + gt_poly = [None] * num_bbox + + for i, box in enumerate(bboxes): + catid = box['category_id'] + gt_class[i][0] = catid2clsid[catid] + gt_bbox[i, :] = box['clean_bbox'] + is_crowd[i][0] = box['iscrowd'] + if 'segmentation' in box: + gt_poly[i] = box['segmentation'] + + im_fname = os.path.join(self._image_dir, + im_fname) if self._image_dir else im_fname + coco_rec = { + 'im_file': im_fname, + 'im_id': np.array([img_id]), + 'h': im_h, + 'w': im_w, + 'is_crowd': is_crowd, + 'gt_class': gt_class, + 'gt_bbox': gt_bbox, + 'gt_score': gt_score, + 'gt_poly': gt_poly, + } + + records.append(coco_rec) + ct += 1 + if self._sample_num > 0 and ct >= self._sample_num: + break + assert len(records) > 0, 'not found any coco record in %s' % (self._anno_path) + logger.info('{} samples in file {}'.format(ct, self._anno_path)) + self._roidbs, self._cname2cid = records, cname2cid + + @property + def num_classes(self): + return len(self._cname2cid) + + def __len__(self): + return len(self._roidbs) + + def _getitem_by_index(self, idx): + roidb = self._roidbs[idx] + with open(roidb['im_file'], 'rb') as f: + data = np.frombuffer(f.read(), dtype='uint8') + im = cv2.imdecode(data, 1) + im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) + im_info = np.array([roidb['im_id'][0], roidb['h'], roidb['w']], dtype='int32') + gt_bbox = roidb['gt_bbox'] + gt_class = roidb['gt_class'] + gt_score = roidb['gt_score'] + return im_info, im, gt_bbox, gt_class, gt_score + + def __getitem__(self, idx): + im_info, im, gt_bbox, gt_class, gt_score = self._getitem_by_index(idx) + + if self._mixup: + mixup_idx = idx + np.random.randint(1, self.__len__()) + mixup_idx %= self.__len__() + _, mixup_im, mixup_bbox, mixup_class, _ = \ + self._getitem_by_index(mixup_idx) + + im, gt_bbox, gt_class, gt_score = \ + self._mixup_image(im, gt_bbox, gt_class, mixup_im, + mixup_bbox, mixup_class) + + if self._transform: + im_info, im, gt_bbox, gt_class, gt_score = \ + self._transform(im_info, im, gt_bbox, gt_class, gt_score) + + return [im_info, im, gt_bbox, gt_class, gt_score] + + def _mixup_image(self, img1, bbox1, class1, img2, bbox2, class2): + factor = np.random.beta(self._alpha, self._beta) + factor = max(0.0, min(1.0, factor)) + if factor >= 1.0: + return img1, bbox1, class1, np.ones_like(class1, 
dtype="float32") + if factor <= 0.0: + return img2, bbox2, class2, np.ones_like(class2, dtype="float32") + + h = max(img1.shape[0], img2.shape[0]) + w = max(img1.shape[1], img2.shape[1]) + img = np.zeros((h, w, img1.shape[2]), 'float32') + img[:img1.shape[0], :img1.shape[1], :] = \ + img1.astype('float32') * factor + img[:img2.shape[0], :img2.shape[1], :] += \ + img2.astype('float32') * (1.0 - factor) + + gt_bbox = np.concatenate((bbox1, bbox2), axis=0) + gt_class = np.concatenate((class1, class2), axis=0) + + score1 = np.ones_like(class1, dtype="float32") * factor + score2 = np.ones_like(class2, dtype="float32") * (1.0 - factor) + gt_score = np.concatenate((score1, score2), axis=0) + + return img, gt_bbox, gt_class, gt_score + + @property + def mixup(self): + return self._mixup + + @mixup.setter + def mixup(self, value): + if not isinstance(value, bool): + raise ValueError("mixup should be a boolean number") + logger.info("{} set mixup to {}".format(self, value)) + self._mixup = value + +def pascalvoc_label(with_background=True): + labels_map = { + 'aeroplane': 1, + 'bicycle': 2, + 'bird': 3, + 'boat': 4, + 'bottle': 5, + 'bus': 6, + 'car': 7, + 'cat': 8, + 'chair': 9, + 'cow': 10, + 'diningtable': 11, + 'dog': 12, + 'horse': 13, + 'motorbike': 14, + 'person': 15, + 'pottedplant': 16, + 'sheep': 17, + 'sofa': 18, + 'train': 19, + 'tvmonitor': 20 + } + if not with_background: + labels_map = {k: v - 1 for k, v in labels_map.items()} + return labels_map diff --git a/yolov3/coco_metric.py b/yolov3/coco_metric.py index ec7bcac24b3dde91d3ae85e39e7bf9e5151f43ec..2f2f9825b1f90c08afa7b6089641d5a4b28be51d 100644 --- a/yolov3/coco_metric.py +++ b/yolov3/coco_metric.py @@ -17,8 +17,6 @@ import json from pycocotools.cocoeval import COCOeval from pycocotools.coco import COCO -from metrics import Metric - import logging FORMAT = '%(asctime)s-%(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -26,12 +24,13 @@ logger = logging.getLogger(__name__) __all__ = ['COCOMetric'] - OUTFILE = './bbox.json' -# considered to change to a callback later -class COCOMetric(Metric): +# COCOMetric behavior is different from Metric defined in high +# level API, COCOMetric will and con only accumulate on the epoch +# end, so we impliment COCOMetric as not a high level API Metric +class COCOMetric(): """ Metrci for MS-COCO dataset, only support update with batch size as 1. 
@@ -43,26 +42,24 @@ class COCOMetric(Metric): """ def __init__(self, anno_path, with_background=True, **kwargs): - super(COCOMetric, self).__init__(**kwargs) self.anno_path = anno_path self.with_background = with_background self.bbox_results = [] self.coco_gt = COCO(anno_path) cat_ids = self.coco_gt.getCatIds() - self.clsid2catid = dict( - {i + int(with_background): catid - for i, catid in enumerate(cat_ids)}) + self.clsid2catid = dict( + {i + int(with_background): catid + for i, catid in enumerate(cat_ids)}) - def update(self, preds, *args, **kwargs): - im_ids, bboxes = preds - assert im_ids.shape[0] == 1, \ + def update(self, img_id, bboxes): + assert img_id.shape[0] == 1, \ "COCOMetric can only update with batch size = 1" if bboxes.shape[1] != 6: # no bbox detected in this batch return - im_id = int(im_ids) + img_id = int(img_id) for i in range(bboxes.shape[0]): dt = bboxes[i, :] clsid, score, xmin, ymin, xmax, ymax = dt.tolist() @@ -72,7 +69,7 @@ class COCOMetric(Metric): h = ymax - ymin + 1 bbox = [xmin, ymin, w, h] coco_res = { - 'image_id': im_id, + 'image_id': img_id, 'category_id': catid, 'bbox': bbox, 'score': score @@ -83,30 +80,30 @@ class COCOMetric(Metric): self.bbox_results = [] def accumulate(self): - if len(self.bbox_results) == 0: - logger.warning("The number of valid bbox detected is zero.\n \ - Please use reasonable model and check input data.\n \ - stop COCOMetric accumulate!") - return [0.0] - with open(OUTFILE, 'w') as f: - json.dump(self.bbox_results, f) - - map_stats = self.cocoapi_eval(OUTFILE, 'bbox', coco_gt=self.coco_gt) - # flush coco evaluation result - sys.stdout.flush() + if len(self.bbox_results) == 0: + logger.warning("The number of valid bbox detected is zero.\n \ + Please use reasonable model and check input data.\n \ + stop COCOMetric accumulate!") + return [0.0] + with open(OUTFILE, 'w') as f: + json.dump(self.bbox_results, f) + + map_stats = self.cocoapi_eval(OUTFILE, 'bbox', coco_gt=self.coco_gt) + # flush coco evaluation result + sys.stdout.flush() self.result = map_stats[0] - return self.result + return [self.result] def cocoapi_eval(self, jsonfile, style, coco_gt=None, anno_file=None): - assert coco_gt != None or anno_file != None - - if coco_gt == None: - coco_gt = COCO(anno_file) - logger.info("Start evaluate...") - coco_dt = coco_gt.loadRes(jsonfile) - coco_eval = COCOeval(coco_gt, coco_dt, style) - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - return coco_eval.stats + assert coco_gt != None or anno_file != None + + if coco_gt == None: + coco_gt = COCO(anno_file) + logger.info("Start evaluate...") + coco_dt = coco_gt.loadRes(jsonfile) + coco_eval = COCOeval(coco_gt, coco_dt, style) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval.stats diff --git a/yolov3/dataset/download_voc.py b/yolov3/dataset/download_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..0d4e3cf368ca446f989f19449adf2775d741fe7f --- /dev/null +++ b/yolov3/dataset/download_voc.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import os.path as osp +import sys +import tarfile + +from download import _download + +import logging +logger = logging.getLogger(__name__) + +DATASETS = { + 'voc': [ + ('https://paddlemodels.bj.bcebos.com/hapi/voc.tar', + '9faeb7fd997aeea843092fd608d5bcb4', ), + ], +} + +def download_decompress_file(data_dir, url, md5): + logger.info("Downloading from {}".format(url)) + tar_file = _download(url, data_dir, md5) + logger.info("Decompressing {}".format(tar_file)) + with tarfile.open(tar_file) as tf: + tf.extractall(path=data_dir) + os.remove(tar_file) + + +if __name__ == "__main__": + data_dir = osp.split(osp.realpath(sys.argv[0]))[0] + for name, infos in DATASETS.items(): + for info in infos: + download_decompress_file(data_dir, *info) + diff --git a/yolov3/image/YOLOv3.jpg b/yolov3/image/YOLOv3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..06b81f545247c1d542fd661f947eb0cf3edc480e Binary files /dev/null and b/yolov3/image/YOLOv3.jpg differ diff --git a/yolov3/image/YOLOv3_structure.jpg b/yolov3/image/YOLOv3_structure.jpg new file mode 100644 index 0000000000000000000000000000000000000000..51bd2d1733e2f78945d3e871cb5b649aad95d633 Binary files /dev/null and b/yolov3/image/YOLOv3_structure.jpg differ diff --git a/yolov3/image/dog.jpg b/yolov3/image/dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b0381222eaed50867643f4166092c781e56d5b Binary files /dev/null and b/yolov3/image/dog.jpg differ diff --git a/yolov3/infer.py b/yolov3/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..f19e86615a0b1c8c57f3469f5a5bdcaa85535e9c --- /dev/null +++ b/yolov3/infer.py @@ -0,0 +1,126 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import os +import argparse +import numpy as np +from PIL import Image + +from paddle import fluid +from paddle.fluid.optimizer import Momentum +from paddle.fluid.io import DataLoader + +from model import Model, Input, set_device +from models import yolov3_darknet53, YoloLoss + +from coco import COCODataset +from transforms import * +from visualizer import draw_bbox + +import logging +logger = logging.getLogger(__name__) + +IMAGE_MEAN = [0.485, 0.456, 0.406] +IMAGE_STD = [0.229, 0.224, 0.225] + + +def get_save_image_name(output_dir, image_path): + """ + Get save image name from source image path. 
+ """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + image_name = os.path.split(image_path)[-1] + name, ext = os.path.splitext(image_name) + return os.path.join(output_dir, "{}".format(name)) + ext + + +def load_labels(label_list, with_background=True): + idx = int(with_background) + cat2name = {} + with open(label_list) as f: + for line in f.readlines(): + line = line.strip() + if line: + cat2name[idx] = line + idx += 1 + return cat2name + + +def main(): + device = set_device(FLAGS.device) + fluid.enable_dygraph(device) if FLAGS.dynamic else None + + inputs = [Input([None, 3], 'int32', name='img_info'), + Input([None, 3, None, None], 'float32', name='image')] + + cat2name = load_labels(FLAGS.label_list, with_background=False) + + model = yolov3_darknet53(num_classes=len(cat2name), + model_mode='test', + pretrained=FLAGS.weights is None) + + model.prepare(inputs=inputs, device=FLAGS.device) + + if FLAGS.weights is not None: + model.load(FLAGS.weights, reset_optimizer=True) + + # image preprocess + orig_img = Image.open(FLAGS.infer_image).convert('RGB') + w, h = orig_img.size + img = orig_img.resize((608, 608), Image.BICUBIC) + img = np.array(img).astype('float32') / 255.0 + img -= np.array(IMAGE_MEAN) + img /= np.array(IMAGE_STD) + img = img.transpose((2, 0, 1))[np.newaxis, :] + img_info = np.array([0, h, w]).astype('int32')[np.newaxis, :] + + _, bboxes = model.test([img_info, img]) + + vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold) + save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image) + logger.info("Detection bbox results save in {}".format(save_name)) + vis_img.save(save_name, quality=95) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Yolov3 Training on VOC") + parser.add_argument( + "--device", type=str, default='gpu', help="device to use, gpu or cpu") + parser.add_argument( + "-d", "--dynamic", action='store_true', help="enable dygraph mode") + parser.add_argument( + "--label_list", type=str, default=None, + help="path to category label list file") + parser.add_argument( + "-t", "--draw_threshold", type=float, default=0.5, + help="threshold to reserve the result for visualization") + parser.add_argument( + "-i", "--infer_image", type=str, default=None, + help="image path for inference") + parser.add_argument( + "-o", "--output_dir", type=str, default='output', + help="directory to save inference result if --visualize is set") + parser.add_argument( + "-w", "--weights", default=None, type=str, + help="path to weights for inference") + FLAGS = parser.parse_args() + assert os.path.isfile(FLAGS.infer_image), \ + "infer_image {} not a file".format(FLAGS.infer_image) + assert os.path.isfile(FLAGS.label_list), \ + "label_list {} not a file".format(FLAGS.label_list) + main() diff --git a/yolov3/main.py b/yolov3/main.py new file mode 100644 index 0000000000000000000000000000000000000000..18c24d196877586475f6aba1f949c3207665fcce --- /dev/null +++ b/yolov3/main.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import argparse +import contextlib +import os + +import numpy as np + +from paddle import fluid +from paddle.fluid.optimizer import Momentum +from paddle.fluid.io import DataLoader + +from model import Model, Input, set_device +from distributed import DistributedBatchSampler +from models import yolov3_darknet53, YoloLoss + +from coco_metric import COCOMetric +from coco import COCODataset +from transforms import * + +NUM_MAX_BOXES = 50 + + +def make_optimizer(step_per_epoch, parameter_list=None): + base_lr = FLAGS.lr + warm_up_iter = 1000 + momentum = 0.9 + weight_decay = 5e-4 + boundaries = [step_per_epoch * e for e in [200, 250]] + values = [base_lr * (0.1 ** i) for i in range(len(boundaries) + 1)] + learning_rate = fluid.layers.piecewise_decay( + boundaries=boundaries, + values=values) + learning_rate = fluid.layers.linear_lr_warmup( + learning_rate=learning_rate, + warmup_steps=warm_up_iter, + start_lr=0.0, + end_lr=base_lr) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + regularization=fluid.regularizer.L2Decay(weight_decay), + momentum=momentum, + parameter_list=parameter_list) + return optimizer + + +def main(): + device = set_device(FLAGS.device) + fluid.enable_dygraph(device) if FLAGS.dynamic else None + + inputs = [Input([None, 3], 'int32', name='img_info'), + Input([None, 3, None, None], 'float32', name='image')] + labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'), + Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'), + Input([None, NUM_MAX_BOXES], 'float32', name='gt_score')] + + if not FLAGS.eval_only: # training mode + train_transform = Compose([ColorDistort(), + RandomExpand(), + RandomCrop(), + RandomFlip(), + NormalizeBox(), + PadBox(), + BboxXYXY2XYWH()]) + train_collate_fn = BatchCompose([RandomShape(), NormalizeImage()]) + dataset = COCODataset(dataset_dir=FLAGS.data, + anno_path='annotations/instances_train2017.json', + image_dir='train2017', + with_background=False, + mixup=True, + transform=train_transform) + batch_sampler = DistributedBatchSampler(dataset, + batch_size=FLAGS.batch_size, + shuffle=True, + drop_last=True) + loader = DataLoader(dataset, + batch_sampler=batch_sampler, + places=device, + num_workers=FLAGS.num_workers, + return_list=True, + collate_fn=train_collate_fn) + else: # evaluation mode + eval_transform = Compose([ResizeImage(target_size=608), + NormalizeBox(), + PadBox(), + BboxXYXY2XYWH()]) + eval_collate_fn = BatchCompose([NormalizeImage()]) + dataset = COCODataset(dataset_dir=FLAGS.data, + anno_path='annotations/instances_val2017.json', + image_dir='val2017', + with_background=False, + transform=eval_transform) + # batch_size can only be 1 in evaluation for YOLOv3 + # prediction bbox is a LoDTensor + batch_sampler = DistributedBatchSampler(dataset, + batch_size=1, + shuffle=False, + drop_last=False) + loader = DataLoader(dataset, + batch_sampler=batch_sampler, + places=device, + num_workers=FLAGS.num_workers, + return_list=True, + collate_fn=eval_collate_fn) + + pretrained = FLAGS.eval_only and FLAGS.weights is None + model = yolov3_darknet53(num_classes=dataset.num_classes, + model_mode='eval' if FLAGS.eval_only else 'train', + pretrained=pretrained) + + if FLAGS.pretrain_weights is not None: + model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True) + + optim = make_optimizer(len(batch_sampler), 
parameter_list=model.parameters()) + + model.prepare(optim, + YoloLoss(num_classes=dataset.num_classes), + inputs=inputs, labels=labels, + device=FLAGS.device) + + # NOTE: we implement COCO metric of YOLOv3 model here, separately + # from 'prepare' and 'fit' framework for follwing reason: + # 1. YOLOv3 network structure is different between 'train' and + # 'eval' mode, in 'eval' mode, output prediction bbox is not the + # feature map used for YoloLoss calculating + # 2. COCO metric behavior is also different from defined Metric + # for COCO metric should not perform accumulate in each iteration + # but only accumulate at the end of an epoch + if FLAGS.eval_only: + if FLAGS.weights is not None: + model.load(FLAGS.weights, reset_optimizer=True) + preds = model.predict(loader, stack_outputs=False) + _, _, _, img_ids, bboxes = preds + + anno_path = os.path.join(FLAGS.data, 'annotations/instances_val2017.json') + coco_metric = COCOMetric(anno_path=anno_path, with_background=False) + for img_id, bbox in zip(img_ids, bboxes): + coco_metric.update(img_id, bbox) + coco_metric.accumulate() + coco_metric.reset() + return + + if FLAGS.resume is not None: + model.load(FLAGS.resume) + + model.fit(train_data=loader, + epochs=FLAGS.epoch - FLAGS.no_mixup_epoch, + save_dir="yolo_checkpoint/mixup", + save_freq=10) + + # do not use image mixup transfrom in laste FLAGS.no_mixup_epoch epoches + dataset.mixup = False + model.fit(train_data=loader, + epochs=FLAGS.no_mixup_epoch, + save_dir="yolo_checkpoint/no_mixup", + save_freq=5) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Yolov3 Training on VOC") + parser.add_argument( + "--data", type=str, default='dataset/voc', + help="path to dataset directory") + parser.add_argument( + "--device", type=str, default='gpu', help="device to use, gpu or cpu") + parser.add_argument( + "-d", "--dynamic", action='store_true', help="enable dygraph mode") + parser.add_argument( + "--eval_only", action='store_true', help="run evaluation only") + parser.add_argument( + "-e", "--epoch", default=300, type=int, help="number of epoch") + parser.add_argument( + "--no_mixup_epoch", default=30, type=int, + help="number of the last N epoch without image mixup") + parser.add_argument( + '--lr', '--learning-rate', default=0.001, type=float, metavar='LR', + help='initial learning rate') + parser.add_argument( + "-b", "--batch_size", default=8, type=int, help="batch size") + parser.add_argument( + "-j", "--num_workers", default=4, type=int, help="reader worker number") + parser.add_argument( + "-p", "--pretrain_weights", default=None, type=str, + help="path to pretrained weights") + parser.add_argument( + "-r", "--resume", default=None, type=str, + help="path to model weights") + parser.add_argument( + "-w", "--weights", default=None, type=str, + help="path to weights for evaluation") + FLAGS = parser.parse_args() + assert FLAGS.data, "error: must provide data path" + main() diff --git a/yolov3/transforms.py b/yolov3/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..a5fbe46cbbfdb39efe3025a351b407b82dbf33c4 --- /dev/null +++ b/yolov3/transforms.py @@ -0,0 +1,620 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import cv2 +import traceback +import numpy as np + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['ColorDistort', 'RandomExpand', 'RandomCrop', 'RandomFlip', + 'NormalizeBox', 'PadBox', 'RandomShape', 'NormalizeImage', + 'BboxXYXY2XYWH', 'ResizeImage', 'Compose', 'BatchCompose'] + + +class Compose(object): + def __init__(self, transforms=[]): + self.transforms = transforms + + def __call__(self, *data): + for f in self.transforms: + try: + data = f(*data) + except Exception as e: + stack_info = traceback.format_exc() + logger.info("fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + return data + + +class BatchCompose(object): + def __init__(self, transforms=[]): + self.transforms = transforms + + def __call__(self, data): + for f in self.transforms: + try: + data = f(data) + except Exception as e: + stack_info = traceback.format_exc() + logger.info("fail to perform batch transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info))) + raise e + + # sample list to batch data + batch = list(zip(*data)) + + return batch + + +class ColorDistort(object): + """Random color distortion. + + Args: + hue (list): hue settings. + in [lower, upper, probability] format. + saturation (list): saturation settings. + in [lower, upper, probability] format. + contrast (list): contrast settings. + in [lower, upper, probability] format. + brightness (list): brightness settings. + in [lower, upper, probability] format. + random_apply (bool): whether to apply in random (yolo) or fixed (SSD) + order. + """ + + def __init__(self, + hue=[-18, 18, 0.5], + saturation=[0.5, 1.5, 0.5], + contrast=[0.5, 1.5, 0.5], + brightness=[0.5, 1.5, 0.5], + random_apply=True): + self.hue = hue + self.saturation = saturation + self.contrast = contrast + self.brightness = brightness + self.random_apply = random_apply + + def apply_hue(self, img): + low, high, prob = self.hue + if np.random.uniform(0., 1.) < prob: + return img + + img = img.astype(np.float32) + + # XXX works, but result differ from HSV version + delta = np.random.uniform(low, high) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321], + [0.211, -0.523, 0.311]]) + ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647], + [1.0, -1.107, 1.705]]) + t = np.dot(np.dot(ityiq, bt), tyiq).T + img = np.dot(img, t) + return img + + def apply_saturation(self, img): + low, high, prob = self.saturation + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + + img = img.astype(np.float32) + gray = img * np.array([[[0.299, 0.587, 0.114]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + return img + + def apply_contrast(self, img): + low, high, prob = self.contrast + if np.random.uniform(0., 1.) 
< prob: + return img + delta = np.random.uniform(low, high) + + img = img.astype(np.float32) + img *= delta + return img + + def apply_brightness(self, img): + low, high, prob = self.brightness + if np.random.uniform(0., 1.) < prob: + return img + delta = np.random.uniform(low, high) + + img = img.astype(np.float32) + img += delta + return img + + def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): + if self.random_apply: + distortions = np.random.permutation([ + self.apply_brightness, self.apply_contrast, + self.apply_saturation, self.apply_hue + ]) + for func in distortions: + im = func(im) + return [im_info, im, gt_bbox, gt_class, gt_score] + + im = self.apply_brightness(im) + + if np.random.randint(0, 2): + im = self.apply_contrast(im) + im = self.apply_saturation(im) + im = self.apply_hue(im) + else: + im = self.apply_saturation(im) + im = self.apply_hue(im) + im = self.apply_contrast(im) + return [im_info, im, gt_bbox, gt_class, gt_score] + + +class RandomExpand(object): + """Random expand the canvas. + + Args: + ratio (float): maximum expansion ratio. + prob (float): probability to expand. + fill_value (list): color value used to fill the canvas. in RGB order. + """ + + def __init__(self, ratio=4., prob=0.5, fill_value=[123.675, 116.28, 103.53]): + assert ratio > 1.01, "expand ratio must be larger than 1.01" + self.ratio = ratio + self.prob = prob + self.fill_value = fill_value + + def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): + if np.random.uniform(0., 1.) < self.prob: + return [im_info, im, gt_bbox, gt_class, gt_score] + + height, width, _ = im.shape + expand_ratio = np.random.uniform(1., self.ratio) + h = int(height * expand_ratio) + w = int(width * expand_ratio) + if not h > height or not w > width: + return [im_info, im, gt_bbox, gt_class, gt_score] + y = np.random.randint(0, h - height) + x = np.random.randint(0, w - width) + canvas = np.ones((h, w, 3), dtype=np.uint8) + canvas *= np.array(self.fill_value, dtype=np.uint8) + canvas[y:y + height, x:x + width, :] = im.astype(np.uint8) + + gt_bbox += np.array([x, y, x, y], dtype=np.float32) + + return [im_info, canvas, gt_bbox, gt_class, gt_score] + + +class RandomCrop(): + """Random crop image and bboxes. + + Args: + aspect_ratio (list): aspect ratio of cropped region. + in [min, max] format. + thresholds (list): iou thresholds for decide a valid bbox crop. + scaling (list): ratio between a cropped region and the original image. + in [min, max] format. + num_attempts (int): number of tries before giving up. + allow_no_crop (bool): allow return without actually cropping them. + cover_all_box (bool): ensure all bboxes are covered in the final crop. + """ + + def __init__(self, + aspect_ratio=[.5, 2.], + thresholds=[.0, .1, .3, .5, .7, .9], + scaling=[.3, 1.], + num_attempts=50, + allow_no_crop=True, + cover_all_box=False): + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + + def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): + if len(gt_bbox) == 0: + return [im_info, im, gt_bbox, gt_class, gt_score] + + # NOTE Original method attempts to generate one candidate for each + # threshold then randomly sample one from the resulting list. + # Here a short circuit approach is taken, i.e., randomly choose a + # threshold and attempt to find a valid crop, and simply return the + # first one found. 
+ # The probability is not exactly the same, kinda resembling the + # "Monty Hall" problem. Actually carrying out the attempts will affect + # observability (just like opening doors in the "Monty Hall" game). + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + np.random.shuffle(thresholds) + + for thresh in thresholds: + if thresh == 'no_crop': + return [im_info, im, gt_bbox, gt_class, gt_score] + + h, w, _ = im.shape + found = False + for i in range(self.num_attempts): + scale = np.random.uniform(*self.scaling) + min_ar, max_ar = self.aspect_ratio + aspect_ratio = np.random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + crop_h = int(h * scale / np.sqrt(aspect_ratio)) + crop_w = int(w * scale * np.sqrt(aspect_ratio)) + crop_y = np.random.randint(0, h - crop_h) + crop_x = np.random.randint(0, w - crop_w) + crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] + iou = self._iou_matrix( + gt_bbox, np.array( + [crop_box], dtype=np.float32)) + if iou.max() < thresh: + continue + + if self.cover_all_box and iou.min() < thresh: + continue + + cropped_box, valid_ids = self._crop_box_with_center_constraint( + gt_bbox, np.array( + crop_box, dtype=np.float32)) + if valid_ids.size > 0: + found = True + break + + if found: + im = self._crop_image(im, crop_box) + gt_bbox = np.take(cropped_box, valid_ids, axis=0) + gt_class = np.take(gt_class, valid_ids, axis=0) + gt_score = np.take(gt_score, valid_ids, axis=0) + return [im_info, im, gt_bbox, gt_class, gt_score] + + return [im_info, im, gt_bbox, gt_class, gt_score] + + def _iou_matrix(self, a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_o + 1e-10) + + def _crop_box_with_center_constraint(self, box, crop): + cropped_box = box.copy() + + cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) + cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) + cropped_box[:, :2] -= crop[:2] + cropped_box[:, 2:] -= crop[:2] + + centers = (box[:, :2] + box[:, 2:]) / 2 + valid = np.logical_and(crop[:2] <= centers, + centers < crop[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return cropped_box, np.where(valid)[0] + + def _crop_image(self, img, crop): + x1, y1, x2, y2 = crop + return img[y1:y2, x1:x2, :] + + +class RandomFlip(): + def __init__(self, prob=0.5, is_normalized=False): + """ + Args: + prob (float): the probability of flipping image + is_normalized (bool): whether the bbox scale to [0,1] + """ + self.prob = prob + self.is_normalized = is_normalized + if not (isinstance(self.prob, float) and + isinstance(self.is_normalized, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + + def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): + """Filp the image and bounding box. + Operators: + 1. Flip the image numpy. + 2. Transform the bboxes' x coordinates. + (Must judge whether the coordinates are normalized!) 
+        """
+
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image is not a numpy array.".format(self))
+        if len(im.shape) != 3:
+            raise ValueError("{}: image is not 3-dimensional.".format(self))
+        height, width, _ = im.shape
+        if np.random.uniform(0, 1) < self.prob:
+            im = im[:, ::-1, :]
+            if gt_bbox.shape[0] > 0:
+                oldx1 = gt_bbox[:, 0].copy()
+                oldx2 = gt_bbox[:, 2].copy()
+                if self.is_normalized:
+                    gt_bbox[:, 0] = 1 - oldx2
+                    gt_bbox[:, 2] = 1 - oldx1
+                else:
+                    gt_bbox[:, 0] = width - oldx2 - 1
+                    gt_bbox[:, 2] = width - oldx1 - 1
+                if gt_bbox.shape[0] != 0 and (
+                        gt_bbox[:, 2] < gt_bbox[:, 0]).all():
+                    m = "{}: invalid box, x2 should be greater than x1".format(
+                        self)
+                    raise ValueError(m)
+        return [im_info, im, gt_bbox, gt_class, gt_score]
+
+
+class NormalizeBox(object):
+    """Transform the bounding box's coordinates to [0, 1]."""
+
+    def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
+        height, width, _ = im.shape
+        for i in range(gt_bbox.shape[0]):
+            gt_bbox[i][0] = gt_bbox[i][0] / width
+            gt_bbox[i][1] = gt_bbox[i][1] / height
+            gt_bbox[i][2] = gt_bbox[i][2] / width
+            gt_bbox[i][3] = gt_bbox[i][3] / height
+        return [im_info, im, gt_bbox, gt_class, gt_score]
+
+
+class PadBox(object):
+    def __init__(self, num_max_boxes=50):
+        """
+        Pad zeros to bboxes if number of bboxes is less than num_max_boxes.
+        Args:
+            num_max_boxes (int): the max number of bboxes
+        """
+        self.num_max_boxes = num_max_boxes
+
+    def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
+        gt_num = min(self.num_max_boxes, len(gt_bbox))
+        num_max = self.num_max_boxes
+
+        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
+        if gt_num > 0:
+            pad_bbox[:gt_num, :] = gt_bbox[:gt_num, :]
+        gt_bbox = pad_bbox
+
+        pad_class = np.zeros((num_max), dtype=np.int32)
+        if gt_num > 0:
+            pad_class[:gt_num] = gt_class[:gt_num, 0]
+        gt_class = pad_class
+
+        pad_score = np.zeros((num_max), dtype=np.float32)
+        if gt_num > 0:
+            pad_score[:gt_num] = gt_score[:gt_num, 0]
+        gt_score = pad_score
+        return [im_info, im, gt_bbox, gt_class, gt_score]
+
+
+class BboxXYXY2XYWH(object):
+    """
+    Convert bbox XYXY format to XYWH format.
+    """
+
+    def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
+        gt_bbox[:, 2:4] = gt_bbox[:, 2:4] - gt_bbox[:, :2]
+        gt_bbox[:, :2] = gt_bbox[:, :2] + gt_bbox[:, 2:4] / 2.
+        return [im_info, im, gt_bbox, gt_class, gt_score]
+
+
+class RandomShape(object):
+    """
+    Randomly reshape a batch. If random_inter is True, an interpolation
+    algorithm is also randomly selected from [cv2.INTER_NEAREST,
+    cv2.INTER_LINEAR, cv2.INTER_AREA, cv2.INTER_CUBIC, cv2.INTER_LANCZOS4].
+    If random_inter is False, cv2.INTER_NEAREST is always used.
+
+    Args:
+        sizes (list): list of int, a size is randomly chosen from these.
+        random_inter (bool): whether to randomly select the interpolation
+            method, default True.
+ """ + + def __init__(self, + sizes=[320, 352, 384, 416, 448, 480, 512, 544, 576, 608], + random_inter=True): + self.sizes = sizes + self.random_inter = random_inter + self.interps = [ + cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_AREA, + cv2.INTER_CUBIC, + cv2.INTER_LANCZOS4, + ] if random_inter else [] + + def __call__(self, samples): + shape = np.random.choice(self.sizes) + method = np.random.choice(self.interps) if self.random_inter \ + else cv2.INTER_NEAREST + for i in range(len(samples)): + im = samples[i][1] + h, w = im.shape[:2] + scale_x = float(shape) / w + scale_y = float(shape) / h + im = cv2.resize( + im, None, None, fx=scale_x, fy=scale_y, interpolation=method) + samples[i][1] = im + return samples + + +class NormalizeImage(object): + def __init__(self, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + scale=True, + channel_first=True): + """ + Args: + mean (list): the pixel mean + std (list): the pixel variance + scale (bool): whether scale image to [0, 1] + channel_first (bool): whehter change [h, w, c] to [c, h, w] + """ + self.mean = mean + self.std = std + self.scale = scale + self.channel_first = channel_first + if not (isinstance(self.mean, list) and isinstance(self.std, list) and + isinstance(self.scale, bool)): + raise TypeError("{}: input type is invalid.".format(self)) + from functools import reduce + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError('{}: std is invalid!'.format(self)) + + def __call__(self, samples): + """Normalize the image. + Operators: + 1. (optional) Scale the image to [0,1] + 2. Each pixel minus mean and is divided by std + 3. (optional) permute channel + """ + for i in range(len(samples)): + im = samples[i][1] + im = im.astype(np.float32, copy=False) + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + if self.scale: + im = im / 255.0 + im -= mean + im /= std + if self.channel_first: + im = im.transpose((2, 0, 1)) + samples[i][1] = im + return samples + + +def _iou_matrix(a, b): + tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + area_o = (area_a[:, np.newaxis] + area_b - area_i) + return area_i / (area_o + 1e-10) + + +def _crop_box_with_center_constraint(box, crop): + cropped_box = box.copy() + cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) + cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) + cropped_box[:, :2] -= crop[:2] + cropped_box[:, 2:] -= crop[:2] + centers = (box[:, :2] + box[:, 2:]) / 2 + valid = np.logical_and( + crop[:2] <= centers, centers < crop[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + return cropped_box, np.where(valid)[0] + + +def random_crop(inputs): + aspect_ratios = [.5, 2.] + thresholds = [.0, .1, .3, .5, .7, .9] + scaling = [.3, 1.] 
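+    # Functional variant of RandomCrop above: shuffle the IoU thresholds,
+    # try up to 50 candidate crops per threshold, and return the first crop
+    # whose boxes pass the IoU and box-center checks; otherwise return the
+    # inputs unchanged.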
+
+    img, img_ids, gt_box, gt_label = inputs
+    h, w = img.shape[:2]
+
+    if len(gt_box) == 0:
+        return inputs
+
+    np.random.shuffle(thresholds)
+    for thresh in thresholds:
+        found = False
+        for i in range(50):
+            scale = np.random.uniform(*scaling)
+            min_ar, max_ar = aspect_ratios
+            ar = np.random.uniform(max(min_ar, scale**2),
+                                   min(max_ar, scale**-2))
+            crop_h = int(h * scale / np.sqrt(ar))
+            crop_w = int(w * scale * np.sqrt(ar))
+            crop_y = np.random.randint(0, h - crop_h)
+            crop_x = np.random.randint(0, w - crop_w)
+            crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h]
+            iou = _iou_matrix(gt_box, np.array([crop_box], dtype=np.float32))
+            if iou.max() < thresh:
+                continue
+
+            cropped_box, valid_ids = _crop_box_with_center_constraint(
+                gt_box, np.array(crop_box, dtype=np.float32))
+            if valid_ids.size > 0:
+                found = True
+                break
+
+        if found:
+            x1, y1, x2, y2 = crop_box
+            img = img[y1:y2, x1:x2, :]
+            gt_box = np.take(cropped_box, valid_ids, axis=0)
+            gt_label = np.take(gt_label, valid_ids, axis=0)
+            return img, img_ids, gt_box, gt_label
+
+    return inputs
+
+
+class ResizeImage(object):
+    def __init__(self,
+                 target_size=0,
+                 interp=cv2.INTER_CUBIC):
+        """
+        Rescale image to the specified target size.
+        If target_size is a list, a size is randomly selected from it,
+        which enables multi-scale training.
+
+        Args:
+            target_size (int|list): the target size of image's short side,
+                multi-scale training is adopted when type is list.
+            interp (int): the interpolation method
+        """
+        self.interp = int(interp)
+        if not (isinstance(target_size, int) or isinstance(target_size, list)):
+            raise TypeError(
+                "Type of target_size is invalid. Must be Integer or List, now is {}".
+                format(type(target_size)))
+        self.target_size = target_size
+
+    def __call__(self, im_info, im, gt_bbox, gt_class, gt_score):
+        """ Resize the image numpy.
+        """
+        if not isinstance(im, np.ndarray):
+            raise TypeError("{}: image type is not numpy.".format(self))
+        if len(im.shape) != 3:
+            raise ValueError('{}: image is not 3-dimensional.'.format(self))
+        # randomly pick a size when a list is given (multi-scale training)
+        if isinstance(self.target_size, list):
+            target_size = int(np.random.choice(self.target_size))
+        else:
+            target_size = self.target_size
+        im_shape = im.shape
+        im_scale_x = float(target_size) / float(im_shape[1])
+        im_scale_y = float(target_size) / float(im_shape[0])
+
+        im = cv2.resize(
+            im,
+            None,
+            None,
+            fx=im_scale_x,
+            fy=im_scale_y,
+            interpolation=self.interp)
+
+        return [im_info, im, gt_bbox, gt_class, gt_score]
+
diff --git a/yolov3/visualizer.py b/yolov3/visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4433df8606ec140fe08f197e445aea6df89bf445
--- /dev/null
+++ b/yolov3/visualizer.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
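+
+# Visualization helpers: color_map builds a color palette for the classes,
+# and draw_bbox draws detection boxes with class names and scores on a PIL
+# image.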
+ +from __future__ import division +from __future__ import print_function + +import numpy as np +from PIL import Image, ImageDraw + +import logging +logger = logging.getLogger(__name__) + +__all__ = ['draw_bbox'] + + +def color_map(num_classes): + color_map = num_classes * [0, 0, 0] + for i in range(0, num_classes): + j = 0 + lab = i + while lab: + color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) + color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) + color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) + j += 1 + lab >>= 3 + color_map = np.array(color_map).reshape(-1, 3) + return color_map + + +def draw_bbox(image, catid2name, bboxes, threshold): + """ + Draw bbox on image + """ + bboxes = np.array(bboxes) + if bboxes.shape[1] != 6: + logger.info("No bbox detect") + return image + + draw = ImageDraw.Draw(image) + + catid2color = {} + color_list = color_map(len(catid2name)) + for bbox in bboxes: + catid, score, xmin, ymin, xmax, ymax = bbox + + if score < threshold: + continue + + if catid not in catid2color: + idx = np.random.randint(len(color_list)) + catid2color[catid] = color_list[idx] + color = tuple(catid2color[catid]) + + # draw bbox + draw.line( + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), + (xmin, ymin)], + width=2, + fill=color) + logger.info("detect {} at {} score: {:.2f}".format( + catid2name[int(catid)], [xmin, ymin, xmax, ymax], score)) + + # draw label + text = "{} {:.2f}".format(catid2name[catid], score) + tw, th = draw.textsize(text) + draw.rectangle( + [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) + + return image
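+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (not part of the original module). The class
+    # names, boxes and output path below are hypothetical placeholders;
+    # any RGB PIL image works as input.
+    img = Image.new('RGB', (640, 480), (128, 128, 128))
+    catid2name = {0: 'person', 1: 'car'}
+    # each row: [category_id, score, xmin, ymin, xmax, ymax]
+    dets = [[0, 0.92, 30, 40, 120, 200], [1, 0.55, 150, 60, 300, 180]]
+    img = draw_bbox(img, catid2name, dets, threshold=0.5)
+    img.save('vis_demo.jpg')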