Commit e01d3c62 authored by Yibing Liu

Merge with the image-related guides

@@ -185,6 +185,9 @@ class AsyncDataReader(object):
                   corresponding description file.
         drop_frame_len (int): Samples whose label length above the value will be
                               dropped. (Use -1 to disable the policy.)
+        split_sentence_threshold (int): Sentences whose length is larger than
+                                        the value will be split.
+                                        (Assign -1 to disable splitting.)
         proc_num (int): Number of processes for processing data.
         sample_buffer_size (int): Buffer size to indicate the maximum samples
                                   cached.
@@ -204,6 +207,7 @@ class AsyncDataReader(object):
                  feature_file_list,
                  label_file_list="",
                  drop_frame_len=512,
+                 split_sentence_threshold=1024,
                  proc_num=10,
                  sample_buffer_size=1024,
                  sample_info_buffer_size=1024,
@@ -214,6 +218,7 @@ class AsyncDataReader(object):
         self._feature_file_list = feature_file_list
         self._label_file_list = label_file_list
         self._drop_frame_len = drop_frame_len
+        self._split_sentence_threshold = split_sentence_threshold
         self._shuffle_block_num = shuffle_block_num
         self._block_info_list = None
         self._rng = random.Random(random_seed)
@@ -262,7 +267,8 @@ class AsyncDataReader(object):
                     map(lambda info: info[0], bucket_block_info),
                     map(lambda info: info[1], bucket_block_info),
                     map(lambda info: info[2], bucket_block_info),
-                    map(lambda info: info[3], bucket_block_info)))
+                    map(lambda info: info[3], bucket_block_info),
+                    split_sentence_threshold=self._split_sentence_threshold))

     # @TODO make this configurable
     def set_transformers(self, transformers):
...
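For orientation, this is how the new keyword threads through the call sites shown in the hunks further down (a minimal sketch; the `reader` module alias and the transformer list `ltrans` follow those hunks and are otherwise assumptions):

```
# -1 disables the drop_frame_len policy; sentences longer than
# 1024 frames are split into shorter pieces before batching.
data_reader = reader.AsyncDataReader(
    "data/train_feature.lst",
    "data/train_label.lst",
    -1,
    split_sentence_threshold=1024)
data_reader.set_transformers(ltrans)  # ltrans as in the profiling hunk below
```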
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=0
 python -u ../../tools/profile.py --feature_lst data/train_feature.lst \
                 --label_lst data/train_label.lst \
-                --mean_var data/aishell/global_mean_var \
-                --parallel \
+                --mean_var data/global_mean_var \
                 --frame_dim 80 \
                 --class_num 3040 \
                 --batch_size 16

-export CUDA_VISIBLE_DEVICES=0,1,2,3
+export CUDA_VISIBLE_DEVICES=4,5,6,7
 python -u ../../train.py --train_feature_lst data/train_feature.lst \
                 --train_label_lst data/train_label.lst \
                 --val_feature_lst data/val_feature.lst \
                 --val_label_lst data/val_label.lst \
-                --mean_var data/aishell/global_mean_var \
+                --mean_var data/global_mean_var \
                 --checkpoints checkpoints \
                 --frame_dim 80 \
                 --class_num 3040 \
@@ -11,4 +11,3 @@ python -u ../../train.py --train_feature_lst data/train_feature.lst \
                 --batch_size 64 \
                 --learning_rate 6.4e-5 \
                 --parallel
-~
@@ -162,7 +162,12 @@ def infer_from_ckpt(args):
     infer_program = fluid.default_main_program().clone()

-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=fluid.layers.exponential_decay(
+            learning_rate=args.learning_rate,
+            decay_steps=1879,
+            decay_rate=1 / 1.2,
+            staircase=True))
     optimizer.minimize(avg_cost)

     place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
...
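The same decay block is applied in the profiling and training scripts below. As a sanity check on what it does, a staircase `exponential_decay` multiplies the base rate by `decay_rate` once every `decay_steps` steps; in plain Python (base rate taken from the `--learning_rate 6.4e-5` used in the commands above):

```
base_lr = 6.4e-5
decay_steps, decay_rate = 1879, 1 / 1.2
lrs = [base_lr * decay_rate ** (step // decay_steps)
       for step in (0, 1879, 3758, 9395)]
# -> [6.4e-05, 5.33e-05, 4.44e-05, 2.57e-05] (rounded)
```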
@@ -137,7 +137,12 @@ def profile(args):
         class_num=args.class_num,
         parallel=args.parallel)

-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=fluid.layers.exponential_decay(
+            learning_rate=args.learning_rate,
+            decay_steps=1879,
+            decay_rate=1 / 1.2,
+            staircase=True))
     optimizer.minimize(avg_cost)

     place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
@@ -150,7 +155,8 @@ def profile(args):
         trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5)
     ]

-    data_reader = reader.AsyncDataReader(args.feature_lst, args.label_lst, -1)
+    data_reader = reader.AsyncDataReader(
+        args.feature_lst, args.label_lst, -1, split_sentence_threshold=1024)
     data_reader.set_transformers(ltrans)

     feature_t = fluid.LoDTensor()
...
@@ -159,7 +159,12 @@ def train(args):
     test_program = fluid.default_main_program().clone()

     #optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=fluid.layers.exponential_decay(
+            learning_rate=args.learning_rate,
+            decay_steps=1879,
+            decay_rate=1 / 1.2,
+            staircase=True))
     optimizer.minimize(avg_cost)

     place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
@@ -186,8 +191,11 @@ def train(args):
                 os.path.exists(args.val_label_lst)):
             return -1.0, -1.0
         # test data reader
-        test_data_reader = reader.AsyncDataReader(args.val_feature_lst,
-                                                  args.val_label_lst)
+        test_data_reader = reader.AsyncDataReader(
+            args.val_feature_lst,
+            args.val_label_lst,
+            -1,
+            split_sentence_threshold=1024)
         test_data_reader.set_transformers(ltrans)
         test_costs, test_accs = [], []
         for batch_id, batch_data in enumerate(
@@ -212,8 +220,11 @@ def train(args):
         return np.mean(test_costs), np.mean(test_accs)

     # train data reader
-    train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
-                                               args.train_label_lst, -1)
+    train_data_reader = reader.AsyncDataReader(
+        args.train_feature_lst,
+        args.train_label_lst,
+        -1,
+        split_sentence_threshold=1024)
     train_data_reader.set_transformers(ltrans)
     # train
...
[中文版](README_cn.md)

## Reproduce DQN, DoubleDQN, DuelingDQN model with Fluid version of PaddlePaddle

Based on PaddlePaddle's next-generation API Fluid, this directory reproduces the DQN models from deep reinforcement learning, matching the level of the indicators reported in the papers on classic Atari games. The model receives the game image as input and uses an end-to-end model to directly predict the next action to take. The repository contains the following three models:

+ DQN in:
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
+ DoubleDQN in:
[Deep Reinforcement Learning with Double Q-Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389)
+ DuelingDQN in:
[Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.html)

## Atari benchmark & performance

### [Atari games introduction](https://gym.openai.com/envs/#atari)

### Pong game result

The average game rewards the three models obtain as the number of training steps grows are shown below (about 3 hours per 1 million steps):

![DQN result](assets/dqn.png)

## How to use

### Dependencies:
+ python2.7
+ gym
+ tqdm
+ opencv-python
+ paddlepaddle-gpu>=0.12.0
+ ale_python_interface

### Install Dependencies:
+ Install PaddlePaddle:
    It is recommended to compile and install PaddlePaddle from source.
+ Install other dependencies:
    ```
    pip install -r requirement.txt
    pip install gym[atari]
    ```
    To install ale_python_interface, see: https://github.com/mgbellemare/Arcade-Learning-Environment

### Start Training:
```
# To train a model for Pong game with gpu (use DQN model as default)
python train.py --rom ./rom_files/pong.bin --use_cuda

# To train a model for Pong with DoubleDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN

# To train a model for Pong with DuelingDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN
```

To train on more games, you can install more ROM files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).

### Start Testing:
```
# Play the game with the saved best model and calculate the average rewards
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong

# Play the game with visualization
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong --viz 0.01
```

Saved models for Pong and Breakout are available [here](https://pan.baidu.com/s/1gIsbNw5V7tMeb74ojx-TMA); you can use them to play the games directly.
## Reproduce DQN, DoubleDQN, DuelingDQN models with the Fluid version of PaddlePaddle

Based on PaddlePaddle's next-generation API Fluid, the DQN models from deep reinforcement learning are reproduced, matching the level of the indicators reported in the papers on classic Atari games. The model receives the game image as input and uses an end-to-end model to directly predict the next control signal to execute. This repository contains the following three models:

+ DQN model:
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
+ DoubleDQN model:
[Deep Reinforcement Learning with Double Q-Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389)
+ DuelingDQN model:
[Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.html)

## Results: performance on Atari games

### [Atari games introduction](https://gym.openai.com/envs/#atari)

### Pong training result

The average game rewards the three models obtain as the number of training steps grows are shown below (about 3 hours per 1 million steps):

![DQN result](assets/dqn.png)

## Usage tutorial

### Dependencies:
+ python2.7
+ gym
+ tqdm
+ opencv-python
+ paddlepaddle-gpu>=0.12.0
+ ale_python_interface

### Install Dependencies:
+ Install PaddlePaddle:
    It is recommended to compile and install PaddlePaddle from source.
+ Install other dependencies:
    ```
    pip install -r requirement.txt
    pip install gym[atari]
    ```
    To install ale_python_interface, see: https://github.com/mgbellemare/Arcade-Learning-Environment

### Start Training:
```
# Train on the Pong game with GPU (the DQN model is used by default)
python train.py --rom ./rom_files/pong.bin --use_cuda

# Train the DoubleDQN model
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN

# Train the DuelingDQN model
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN
```

To train on more games, you can download game ROM files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).

### Start Testing:
```
# Play the game with the best model saved during training and
# calculate the average rewards
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong

# Play the game with visualization
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong --viz 0.01
```

Trained models for Pong and Breakout are available [here](https://pan.baidu.com/s/1gIsbNw5V7tMeb74ojx-TMA) and can be used for testing directly.
@@ -11,7 +11,7 @@ from tqdm import tqdm

 def predict_action(exe, state, predict_program, feed_names, fetch_targets,
                    action_dim):
-    if np.random.randint(100) == 0:
+    if np.random.random() < 0.01:
         act = np.random.randint(action_dim)
     else:
         state = np.expand_dims(state, axis=0)
...
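Both the old and the new condition explore with probability 0.01; `np.random.random() < 0.01` simply makes the epsilon explicit. In isolation, the epsilon-greedy selection that `predict_action` implements looks like this (a sketch, not the repository's exact code):

```
import numpy as np

def epsilon_greedy(q_values, epsilon=0.01):
    # Explore: pick a uniformly random action with probability epsilon.
    if np.random.random() < epsilon:
        return np.random.randint(len(q_values))
    # Exploit: pick the action with the highest predicted Q-value.
    return int(np.argmax(q_values))
```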
numpy
gym
tqdm
opencv-python
paddlepaddle-gpu==0.12.0
@@ -120,6 +120,9 @@ def train_agent():
     pbar = tqdm(total=1e8)
     recent_100_reward = []
     total_step = 0
+    max_reward = None
+    save_path = os.path.join(args.model_dirname, '{}-{}'.format(
+        args.alg, os.path.basename(args.rom).split('.')[0]))
     while True:
         # start epoch
         total_reward, step = run_train_episode(agent, env, exp)
@@ -134,11 +137,8 @@ def train_agent():
             print("eval_agent done, (steps, eval_reward): ({}, {})".format(
                 total_step, eval_reward))
-            if total_step // args.save_every_steps == save_flag:
-                save_flag += 1
-                save_path = os.path.join(args.model_dirname, '{}-{}'.format(
-                    args.alg, os.path.basename(args.rom).split('.')[0]),
-                    'step{}'.format(total_step))
+            if max_reward is None or eval_reward > max_reward:
+                max_reward = eval_reward
                 fluid.io.save_inference_model(save_path, ['state'],
                                               agent.pred_value, agent.exe,
                                               agent.predict_program)
@@ -173,11 +173,6 @@ if __name__ == '__main__':
         type=str,
         default='saved_model',
         help='dirname to save model')
-    parser.add_argument(
-        '--save_every_steps',
-        type=int,
-        default=100000,
-        help='every steps number to save model')
     parser.add_argument(
         '--test_every_steps',
         type=int,
...
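Note the behavioral change: with `--save_every_steps` removed, models are no longer checkpointed into `step{N}` subdirectories at fixed intervals; a single inference model at `saved_model/{alg}-{rom}` is overwritten whenever the evaluation reward improves. This is why the play commands in the README above point at `./saved_model/DQN-pong` without a step suffix.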
# Introduction to models

## Image Classification

Image classification distinguishes images of different categories according to their semantic content. It is a fundamental problem in computer vision and the basis of higher-level vision tasks such as object detection, image segmentation, object tracking, action analysis, and face recognition, with wide applications in many fields: face recognition and intelligent video analysis in security, traffic scene recognition in transportation, content-based image retrieval and automatic album categorization on the web, and image recognition in medicine.

In the deep learning era, the accuracy of image classification has risen dramatically. For this task we show how to train commonly used models on the classic ImageNet dataset, including AlexNet, VGG, GoogLeNet, ResNet, Inception-v4, MobileNet, DPN (Dual Path Network), and SE-ResNeXt, and we open-source the [trained models](https://github.com/PaddlePaddle/models/blob/develop/fluid/image_classification/README_cn.md#已有模型及其性能) for users to download and use. A tool that converts Caffe models into PaddlePaddle Fluid model configuration and parameter files is also provided.

- [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [VGG](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [GoogleNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [MobileNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Dual Path Network](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [SE-ResNeXt](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Tool for converting Caffe models into Paddle Fluid configuration and model files](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)

## Object Detection

The goal of object detection is, given an image or a video frame, to have the computer find the locations of all targets in it and assign each one a specific category. For humans, object detection is a very simple task; what a computer can "see", however, are the numbers the image has been encoded into, so it is hard for it to grasp high-level semantic concepts such as a person or an object appearing in the image or frame, and harder still to localize the region where the target appears. At the same time, targets may appear anywhere in the image or frame, their shapes vary enormously, and the backgrounds differ widely, all of which makes object detection a challenging problem for computers.

For this task we show how to train the detection algorithm SSD on the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [MS COCO](http://cocodataset.org/#home) datasets. SSD, short for Single Shot MultiBox Detector, is one of the newer detection algorithms in the field with strong results, notable for both fast detection and high accuracy. A trained [MobileNet-SSD model](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md#模型发布) is released as well.

- [Single Shot MultiBox Detector](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md)

## Semantic Image Segmentation

Semantic image segmentation, as the name implies, groups or segments image pixels according to their semantic meaning. Image semantics refers to the understanding of image content, for example being able to describe which objects are doing what and where; segmentation means labeling every pixel of an image with the category it belongs to. In recent years it has been used in autonomous driving to segment street scenes for avoiding pedestrians and vehicles, and in medical image analysis to assist diagnosis.

For this task we show how to perform semantic segmentation with the Image Cascade Network (ICNet), which balances accuracy and speed better than other segmentation algorithms.

- [ICNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/icnet)

## Scene Text Recognition

Many scene images contain rich text information, which plays an important role in understanding the image and greatly helps people perceive and comprehend its content. Scene text recognition is the process of converting image information into text sequences despite complex backgrounds, low resolution, diverse fonts, and arbitrary layouts; it can be regarded as a special kind of translation: from image input to natural-language output. The development of scene text recognition has also enabled new applications, such as automatically recognizing the text on road signs to help street-view applications obtain more accurate address information.

For this task we show how to combine CNN-based image feature extraction with RNN-based sequence translation, eliminating hand-crafted features and avoiding character segmentation, using automatically learned image features to complete end-to-end, unconstrained character localization and recognition. The CRNN-CTC model is covered for now; an attention-based sequence-to-sequence model will be introduced later.

- [CRNN-CTC model](https://github.com/PaddlePaddle/models/tree/develop/fluid/ocr_recognition)

## Speech Recognition
...
@@ -4,4 +4,6 @@ data/
 label/
 *.swp
 *.log
-infer_results/
+log*
+output*
+infer_results*
"""
This code is based on https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py
"""
import time
import numpy as np
import threading
import multiprocessing
try:
import queue
except ImportError:
import Queue as queue
class GeneratorEnqueuer(object):
"""
Builds a queue out of a data generator.
Args:
generator: a generator function which endlessly yields data
use_multiprocessing (bool): use multiprocessing if True,
otherwise use threading.
wait_time (float): time to sleep in-between calls to `put()`.
random_seed (int): Initial seed for workers,
will be incremented by one for each worker.
"""
def __init__(self,
generator,
use_multiprocessing=False,
wait_time=0.05,
random_seed=None):
self.wait_time = wait_time
self._generator = generator
self._use_multiprocessing = use_multiprocessing
self._threads = []
self._stop_event = None
self.queue = None
self._manager = None
self.seed = random_seed
def start(self, workers=1, max_queue_size=10):
"""
Start worker threads which add data from the generator into the queue.
Args:
workers (int): number of worker threads
max_queue_size (int): queue size
(when full, threads could block on `put()`)
"""
def data_generator_task():
"""
Data generator task.
"""
def task():
if (self.queue is not None and
self.queue.qsize() < max_queue_size):
generator_output = next(self._generator)
self.queue.put((generator_output))
else:
time.sleep(self.wait_time)
if not self._use_multiprocessing:
while not self._stop_event.is_set():
with self.genlock:
try:
task()
except Exception:
self._stop_event.set()
break
else:
while not self._stop_event.is_set():
try:
task()
except Exception:
self._stop_event.set()
break
try:
if self._use_multiprocessing:
self._manager = multiprocessing.Manager()
self.queue = self._manager.Queue(maxsize=max_queue_size)
self._stop_event = multiprocessing.Event()
else:
self.genlock = threading.Lock()
self.queue = queue.Queue()
self._stop_event = threading.Event()
for _ in range(workers):
if self._use_multiprocessing:
# Reset random seed else all children processes
# share the same seed
np.random.seed(self.seed)
thread = multiprocessing.Process(target=data_generator_task)
thread.daemon = True
if self.seed is not None:
self.seed += 1
else:
thread = threading.Thread(target=data_generator_task)
self._threads.append(thread)
thread.start()
except:
self.stop()
raise
def is_running(self):
"""
Returns:
bool: Whether the worker threads are running.
"""
return self._stop_event is not None and not self._stop_event.is_set()
def stop(self, timeout=None):
"""
Stops running threads and waits for them to exit, if necessary.
Should be called by the same thread which called `start()`.
Args:
timeout(int|None): maximum time to wait on `thread.join()`.
"""
if self.is_running():
self._stop_event.set()
for thread in self._threads:
if self._use_multiprocessing:
if thread.is_alive():
thread.terminate()
else:
thread.join(timeout)
if self._manager:
self._manager.shutdown()
self._threads = []
self._stop_event = None
self.queue = None
def get(self):
"""
Creates a generator to extract data from the queue.
Skip the data if it is `None`.
Yields:
tuple of data in the queue.
"""
while self.is_running():
if not self.queue.empty():
inputs = self.queue.get()
if inputs is not None:
yield inputs
else:
time.sleep(self.wait_time)
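A minimal usage sketch for the class above (the toy generator is illustrative, not part of the repository):

```
def counter():
    i = 0
    while True:
        yield i
        i += 1

enqueuer = GeneratorEnqueuer(counter(), use_multiprocessing=False)
enqueuer.start(workers=2, max_queue_size=10)
batches = enqueuer.get()
print([next(batches) for _ in range(3)])  # typically prints [0, 1, 2]
enqueuer.stop()
```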
@@ -3,6 +3,7 @@ from PIL import ImageFile
 import numpy as np
 import random
 import math
+import cv2

 ImageFile.LOAD_TRUNCATED_IMAGES = True  #otherwise IOError raised image file is truncated
@@ -107,10 +108,10 @@ def data_anchor_sampling(sampler, bbox_labels, image_width, image_height,
     rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0

     if num_gt != 0:
-        norm_xmin = bbox_labels[rand_idx][0]
-        norm_ymin = bbox_labels[rand_idx][1]
-        norm_xmax = bbox_labels[rand_idx][2]
-        norm_ymax = bbox_labels[rand_idx][3]
+        norm_xmin = bbox_labels[rand_idx][1]
+        norm_ymin = bbox_labels[rand_idx][2]
+        norm_xmax = bbox_labels[rand_idx][3]
+        norm_ymax = bbox_labels[rand_idx][4]

         xmin = norm_xmin * image_width
         ymin = norm_ymin * image_height
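The index shift matches the label layout used throughout the reader (`label | xmin | ymin | xmax | ymax`): index 0 holds the class label, so the normalized box corners live at indices 1 through 4.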
@@ -321,7 +322,34 @@ def transform_labels(bbox_labels, sample_bbox):
     return sample_labels


-def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
+def transform_labels_sampling(bbox_labels, sample_bbox, resize_val,
+                              min_face_size):
+    sample_labels = []
+    for i in range(len(bbox_labels)):
+        sample_label = []
+        object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2],
+                           bbox_labels[i][3], bbox_labels[i][4])
+        if not meet_emit_constraint(object_bbox, sample_bbox):
+            continue
+        proj_bbox = project_bbox(object_bbox, sample_bbox)
+        if proj_bbox:
+            real_width = float((proj_bbox.xmax - proj_bbox.xmin) * resize_val)
+            real_height = float((proj_bbox.ymax - proj_bbox.ymin) * resize_val)
+            if real_width * real_height < float(min_face_size * min_face_size):
+                continue
+            else:
+                sample_label.append(bbox_labels[i][0])
+                sample_label.append(float(proj_bbox.xmin))
+                sample_label.append(float(proj_bbox.ymin))
+                sample_label.append(float(proj_bbox.xmax))
+                sample_label.append(float(proj_bbox.ymax))
+                sample_label = sample_label + bbox_labels[i][5:]
+                sample_labels.append(sample_label)
+    return sample_labels
+
+
+def crop_image(img, bbox_labels, sample_bbox, image_width, image_height,
+               resize_width, resize_height, min_face_size):
     sample_bbox = clip_bbox(sample_bbox)
     xmin = int(sample_bbox.xmin * image_width)
     xmax = int(sample_bbox.xmax * image_width)
@@ -329,12 +357,15 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height,
     ymax = int(sample_bbox.ymax * image_height)

     sample_img = img[ymin:ymax, xmin:xmax]
-    sample_labels = transform_labels(bbox_labels, sample_bbox)
+    resize_val = resize_width
+    sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
+                                              resize_val, min_face_size)

     return sample_img, sample_labels


 def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
-                        image_height, resize_width, resize_height):
+                        image_height, resize_width, resize_height,
+                        min_face_size):
     # no clipping here
     xmin = int(sample_bbox.xmin * image_width)
     xmax = int(sample_bbox.xmax * image_width)
@@ -358,14 +389,16 @@ def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
     roi_width = cross_width
     roi_height = cross_height

-    sample_img = np.zeros((width, height, 3))
-    sample_img[roi_xmin : roi_xmin + roi_width, roi_ymin : roi_ymin + roi_height] = \
-        img[cross_xmin : cross_xmin + cross_width, cross_ymin : cross_ymin + cross_height]
+    sample_img = np.zeros((height, width, 3))
+    sample_img[int(roi_ymin) : int(roi_ymin + roi_height), int(roi_xmin) : int(roi_xmin + roi_width)] = \
+        img[int(cross_ymin) : int(cross_ymin + cross_height), int(cross_xmin) : int(cross_xmin + cross_width)]

     sample_img = cv2.resize(
         sample_img, (resize_width, resize_height), interpolation=cv2.INTER_AREA)

-    sample_labels = transform_labels(bbox_labels, sample_bbox)
+    resize_val = resize_width
+    sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
+                                              resize_val, min_face_size)

     return sample_img, sample_labels
...
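Both fixes in `crop_image_sampling` follow NumPy's image convention: an array holding an image is indexed `[row, column]`, i.e. `[y, x]`, so the destination buffer must be allocated as `(height, width, 3)` and both the ROI and the source crop must be sliced with the y range first. The added `int()` casts guard against float box coordinates coming out of the samplers.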
@@ -15,7 +15,7 @@ parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)

 # yapf: disable
 add_arg('use_gpu',         bool,  True,  "Whether use GPU.")
-add_arg('use_pyramidbox',  bool,  False, "Whether use PyramidBox model.")
+add_arg('use_pyramidbox',  bool,  True,  "Whether use PyramidBox model.")
 add_arg('confs_threshold', float, 0.25,  "Confidence threshold to draw bbox.")
 add_arg('image_path',      str,   '',    "The data root path.")
 add_arg('model_dir',       str,   '',    "The model path.")
@@ -168,6 +168,9 @@ def detect_face(image, shrink):
         return_numpy=False)
     detection = np.array(detection)
     # layout: xmin, ymin, xmax, ymax, score
+    if detection.shape == (1, ):
+        print("No face detected")
+        return np.array([[0, 0, 0, 0, 0]])
     det_conf = detection[:, 1]
     det_xmin = image_shape[2] * detection[:, 2] / shrink
     det_ymin = image_shape[1] * detection[:, 3] / shrink
@@ -227,6 +230,33 @@ def multi_scale_test(image, max_shrink):
     return det_s, det_b


+def multi_scale_test_pyramid(image, max_shrink):
+    # shrink detecting and shrink only detect big face
+    det_b = detect_face(image, 0.25)
+    index = np.where(
+        np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
+        > 30)[0]
+    det_b = det_b[index, :]
+
+    st = [0.5, 0.75, 1.25, 1.5, 1.75, 2.25]
+    for i in range(len(st)):
+        if (st[i] <= max_shrink):
+            det_temp = detect_face(image, st[i])
+            # enlarge only detect small face
+            if st[i] > 1:
+                index = np.where(
+                    np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
+                               det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
+                det_temp = det_temp[index, :]
+            else:
+                index = np.where(
+                    np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
+                               det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
+                det_temp = det_temp[index, :]
+            det_b = np.row_stack((det_b, det_temp))
+    return det_b
+
+
 def get_im_shrink(image_shape):
     max_shrink_v1 = (0x7fffffff / 577.0 /
                      (image_shape[1] * image_shape[2]))**0.5
@@ -272,7 +302,8 @@ def infer(args, batch_size, data_args):
         det0 = detect_face(image, shrink)
         det1 = flip_test(image, shrink)
         [det2, det3] = multi_scale_test(image, max_shrink)
-        det = np.row_stack((det0, det1, det2, det3))
+        det4 = multi_scale_test_pyramid(image, max_shrink)
+        det = np.row_stack((det0, det1, det2, det3, det4))
         dets = bbox_vote(det)

         image_name = image_path.split('/')[-1]
...
import os
import shutil
import numpy as np
import time
import argparse
import functools
import reader
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from pyramidbox import PyramidBox
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 20, "Minibatch size.")
add_arg('num_iteration', int, 10, "Iteration number.")
add_arg('skip_reader', bool, False, "Whether to skip data reader.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
#yapf: enable
def train(args, config, train_file_list, optimizer_method):
learning_rate = args.learning_rate
batch_size = args.batch_size
height = args.resize_h
width = args.resize_w
use_gpu = args.use_gpu
use_pyramidbox = args.use_pyramidbox
model_save_dir = args.model_save_dir
pretrained_model = args.pretrained_model
skip_reader = args.skip_reader
num_iterations = args.num_iteration
parallel = args.parallel
num_classes = 2
image_shape = [3, height, width]
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
fetches = []
network = PyramidBox(image_shape, num_classes,
sub_network=use_pyramidbox)
if use_pyramidbox:
face_loss, head_loss, loss = network.train()
fetches = [face_loss, head_loss]
else:
loss = network.vgg_ssd_loss()
fetches = [loss]
epocs = 12880 / batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
]
if optimizer_method == "momentum":
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=boundaries, values=values),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.0005),
)
else:
optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.0005),
)
optimizer.minimize(loss)
fluid.memory_optimize(fluid.default_main_program())
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
start_pass = 0
if pretrained_model:
if pretrained_model.isdigit():
start_pass = int(pretrained_model) + 1
pretrained_model = os.path.join(model_save_dir, pretrained_model)
print("Resume from %s " %(pretrained_model))
if not os.path.exists(pretrained_model):
raise ValueError("The pre-trained model path [%s] does not exist." %
(pretrained_model))
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu, loss_name=loss.name)
train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def tensor(data, place, lod=None):
t = fluid.core.LoDTensor()
t.set(data, place)
if lod:
t.set_lod(lod)
return t
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
def run(iterations, feed_data):
# global feed_data
reader_time = []
run_time = []
for batch_id in range(iterations):
start_time = time.time()
if not skip_reader:
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
end_time = time.time()
reader_time.append(end_time - start_time)
start_time = time.time()
if parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
feed=feed_data)
else:
fetch_vars = exe.run(fluid.default_main_program(),
feed=feed_data,
fetch_list=fetches)
end_time = time.time()
run_time.append(end_time - start_time)
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
if not args.use_pyramidbox:
print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
else:
print("Batch {0}, face loss {1}, head loss {2}".format(
batch_id, fetch_vars[0], fetch_vars[1]))
return reader_time, run_time
# start-up
run(2, feed_data)
# profiling
start = time.time()
if not parallel:
with profiler.profiler('All', 'total', '/tmp/profile_file'):
reader_time, run_time = run(num_iterations, feed_data)
else:
reader_time, run_time = run(num_iterations, feed_data)
end = time.time()
total_time = end - start
print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
total_time, np.sum(reader_time), np.sum(run_time)))
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
data_dir = 'data/WIDERFACE/WIDER_train/images/'
train_file_list = 'label/train_gt_widerface.res'
config = reader.Settings(
data_dir=data_dir,
resize_h=args.resize_h,
resize_w=args.resize_w,
apply_expand=False,
mean_value=[104., 117., 123.],
ap_version='11point')
train(args, config, train_file_list, optimizer_method="momentum")
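For orientation, the piecewise schedule above in plain numbers (assuming the default `--batch_size 20` and `--learning_rate 0.001`; 12880 appears to be the size of the WIDER FACE training split, so `epocs` is effectively steps per pass):

```
batch_size = 20
steps_per_pass = 12880 // batch_size        # 644
boundaries = [steps_per_pass * p for p in (40, 60, 80, 100)]
values = [0.001 * f for f in (1.0, 0.5, 0.25, 0.1, 0.01)]
# The learning rate is values[i] while the global step is below
# boundaries[i], and stays at values[-1] after the last boundary.
```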
@@ -81,10 +81,7 @@ class PyramidBox(object):
         if self.is_infer:
             return [self.image]
         else:
-            return [
-                self.image, self.face_box, self.head_box, self.gt_label,
-                self.difficult
-            ]
+            return [self.image, self.face_box, self.head_box, self.gt_label]

     def _input(self):
         self.image = fluid.layers.data(
@@ -96,8 +93,6 @@ class PyramidBox(object):
             name='head_box', shape=[4], dtype='float32', lod_level=1)
         self.gt_label = fluid.layers.data(
             name='gt_label', shape=[1], dtype='int32', lod_level=1)
-        self.difficult = fluid.layers.data(
-            name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

     def _vgg(self):
         self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2)
@@ -144,7 +139,8 @@ class PyramidBox(object):
                 stride=2,
                 groups=ch,
                 param_attr=w_attr,
-                bias_attr=False)
+                bias_attr=False,
+                use_cudnn=True)
         else:
             upsampling = fluid.layers.resize_bilinear(
                 conv1, out_shape=up_to.shape[2:])
@@ -385,6 +381,7 @@ class PyramidBox(object):
             self.box_vars,
             overlap_threshold=0.35,
             neg_overlap=0.35)
+        face_loss.persistable = True
         head_loss = fluid.layers.ssd_loss(
             self.head_mbox_loc,
             self.head_mbox_conf,
@@ -394,9 +391,13 @@ class PyramidBox(object):
             self.box_vars,
             overlap_threshold=0.35,
             neg_overlap=0.35)
+        head_loss.persistable = True
         face_loss = fluid.layers.reduce_sum(face_loss)
+        face_loss.persistable = True
         head_loss = fluid.layers.reduce_sum(head_loss)
+        head_loss.persistable = True
         total_loss = face_loss + head_loss
+        total_loss.persistable = True

         return face_loss, head_loss, total_loss

     def infer(self, main_program=None):
@@ -410,5 +411,8 @@ class PyramidBox(object):
             self.face_mbox_conf,
             self.prior_boxes,
             self.box_vars,
-            nms_threshold=0.45)
+            nms_threshold=0.3,
+            nms_top_k=5000,
+            keep_top_k=750,
+            score_threshold=0.05)
         return test_program, face_nmsed_out
@@ -23,6 +23,8 @@ import os
 import time
 import copy
 import random
+import cv2
+from data_util import GeneratorEnqueuer

 class Settings(object):
@@ -61,9 +63,29 @@ class Settings(object):
         self.brightness_delta = 0.125
         self.scale = 0.007843  # 1 / 127.5
         self.data_anchor_sampling_prob = 0.5
+        self.min_face_size = 8.0


-def preprocess(img, bbox_labels, mode, settings):
+def draw_image(faces_pred, img, resize_val):
+    for i in range(len(faces_pred)):
+        draw_rotate_rectange(img, faces_pred[i], resize_val, (0, 255, 0), 3)
+
+
+def draw_rotate_rectange(img, face, resize_val, color, thickness):
+    cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
+        face[3] * resize_val), int(face[2] * resize_val)), color, thickness)
+    cv2.line(img, (int(face[3] * resize_val), int(face[2] * resize_val)), (int(
+        face[3] * resize_val), int(face[4] * resize_val)), color, thickness)
+    cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
+        face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
+    cv2.line(img, (int(face[3] * resize_val), int(face[4] * resize_val)), (int(
+        face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
+
+
+def preprocess(img, bbox_labels, mode, settings, image_path):
     img_width, img_height = img.size
     sampled_labels = bbox_labels
     if mode == 'train':
@@ -86,13 +108,28 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
             batch_sampler, bbox_labels, img_width, img_height, scale_array,
             settings.resize_width, settings.resize_height)
         img = np.array(img)
+        # Debug
+        # img_save = Image.fromarray(img)
+        # img_save.save('img_orig.jpg')
         if len(sampled_bbox) > 0:
             idx = int(random.uniform(0, len(sampled_bbox)))
             img, sampled_labels = image_util.crop_image_sampling(
                 img, bbox_labels, sampled_bbox[idx], img_width, img_height,
-                resize_width, resize_heigh)
+                settings.resize_width, settings.resize_height,
+                settings.min_face_size)
+
+        img = img.astype('uint8')
+        # Debug: visualize the gt bbox
+        visualize_bbox = 0
+        if visualize_bbox:
+            img_show = img
+            draw_image(sampled_labels, img_show, settings.resize_height)
+            img_show = Image.fromarray(img_show)
+            img_show.save('final_img_show.jpg')

         img = Image.fromarray(img)
+        # Debug
+        # img.save('final_img.jpg')
     else:
         # hard-code here
@@ -118,7 +155,9 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
             if len(sampled_bbox) > 0:
                 idx = int(random.uniform(0, len(sampled_bbox)))
                 img, sampled_labels = image_util.crop_image(
-                    img, bbox_labels, sampled_bbox[idx], img_width, img_height)
+                    img, bbox_labels, sampled_bbox[idx], img_width, img_height,
+                    settings.resize_width, settings.resize_height,
+                    settings.min_face_size)

         img = Image.fromarray(img)
@@ -146,20 +185,20 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
     return img, sampled_labels
-def put_txt_in_dict(input_txt):
+def load_file_list(input_txt):
     with open(input_txt, 'r') as f_dir:
         lines_input_txt = f_dir.readlines()

-    dict_input_txt = {}
+    file_dict = {}
     num_class = 0
     for i in range(len(lines_input_txt)):
         tmp_line_txt = lines_input_txt[i].strip('\n\t\r')
         if '--' in tmp_line_txt:
             if i != 0:
                 num_class += 1
-            dict_input_txt[num_class] = []
+            file_dict[num_class] = []
             dict_name = tmp_line_txt
-            dict_input_txt[num_class].append(tmp_line_txt)
+            file_dict[num_class].append(tmp_line_txt)
         if '--' not in tmp_line_txt:
             if len(tmp_line_txt) > 6:
                 split_str = tmp_line_txt.split(' ')
@@ -169,11 +208,11 @@ def load_file_list(input_txt):
                 y2_max = float(split_str[3])
                 tmp_line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
                     x2_max) + ' ' + str(y2_max)
-                dict_input_txt[num_class].append(tmp_line_txt)
+                file_dict[num_class].append(tmp_line_txt)
             else:
-                dict_input_txt[num_class].append(tmp_line_txt)
+                file_dict[num_class].append(tmp_line_txt)

-    return dict_input_txt
+    return file_dict
 def expand_bboxes(bboxes,
@@ -200,32 +239,28 @@ def expand_bboxes(bboxes,
     return expand_boxes


-def pyramidbox(settings, file_list, mode, shuffle):
-
-    dict_input_txt = {}
-    dict_input_txt = put_txt_in_dict(file_list)
-
-    def reader():
-        if mode == 'train' and shuffle:
-            random.shuffle(dict_input_txt)
-        for index_image in range(len(dict_input_txt)):
-            image_name = dict_input_txt[index_image][0] + '.jpg'
+def train_generator(settings, file_list, batch_size, shuffle=True):
+    file_dict = load_file_list(file_list)
+    while True:
+        if shuffle:
+            random.shuffle(file_dict)
+        images, face_boxes, head_boxes, label_ids = [], [], [], []
+        label_offs = [0]
+
+        for index_image in file_dict.keys():
+            image_name = file_dict[index_image][0] + '.jpg'
             image_path = os.path.join(settings.data_dir, image_name)

             im = Image.open(image_path)
             if im.mode == 'L':
                 im = im.convert('RGB')
             im_width, im_height = im.size

             # layout: label | xmin | ymin | xmax | ymax
-            if mode == 'train':
-                bbox_labels = []
-                for index_box in range(len(dict_input_txt[index_image])):
-                    if index_box >= 2:
-                        bbox_sample = []
-                        temp_info_box = dict_input_txt[index_image][
-                            index_box].split(' ')
+            bbox_labels = []
+            for index_box in range(len(file_dict[index_image])):
+                if index_box >= 2:
+                    bbox_sample = []
+                    temp_info_box = file_dict[index_image][index_box].split(' ')
                     xmin = float(temp_info_box[0])
                     ymin = float(temp_info_box[1])
                     w = float(temp_info_box[2])
@@ -240,27 +275,70 @@ def train_generator(settings, file_list, batch_size, shuffle=True):
                     bbox_sample.append(float(ymax) / im_height)
                     bbox_labels.append(bbox_sample)

-                im, sample_labels = preprocess(im, bbox_labels, mode, settings)
-                sample_labels = np.array(sample_labels)
-                if len(sample_labels) == 0: continue
-                im = im.astype('float32')
-                boxes = sample_labels[:, 1:5]
-                lbls = [1] * len(boxes)
-                difficults = [1] * len(boxes)
-                yield im, boxes, expand_bboxes(boxes), lbls, difficults
-
-            if mode == 'test':
-                yield im, image_path
-
-    return reader
+            im, sample_labels = preprocess(im, bbox_labels, "train", settings,
+                                           image_path)
+            sample_labels = np.array(sample_labels)
+            if len(sample_labels) == 0: continue
+
+            im = im.astype('float32')
+            face_box = sample_labels[:, 1:5]
+            head_box = expand_bboxes(face_box)
+            label = [1] * len(face_box)
+
+            images.append(im)
+            face_boxes.extend(face_box)
+            head_boxes.extend(head_box)
+            label_ids.extend(label)
+            label_offs.append(label_offs[-1] + len(face_box))
+
+            if len(images) == batch_size:
+                images = np.array(images).astype('float32')
+                face_boxes = np.array(face_boxes).astype('float32')
+                head_boxes = np.array(head_boxes).astype('float32')
+                label_ids = np.array(label_ids).astype('int32')
+                yield images, face_boxes, head_boxes, label_ids, label_offs
+                images, face_boxes, head_boxes = [], [], []
+                label_ids, label_offs = [], [0]
+
+
+def train_batch_reader(settings,
+                       file_list,
+                       batch_size,
+                       shuffle=True,
+                       num_workers=8):
+    try:
+        enqueuer = GeneratorEnqueuer(
+            train_generator(settings, file_list, batch_size, shuffle),
+            use_multiprocessing=False)
+        enqueuer.start(max_queue_size=24, workers=num_workers)
+        generator_output = None
+        while True:
+            while enqueuer.is_running():
+                if not enqueuer.queue.empty():
+                    generator_output = enqueuer.queue.get()
+                    break
+                else:
+                    time.sleep(0.01)
+            yield generator_output
+            generator_output = None
+    finally:
+        if enqueuer is not None:
+            enqueuer.stop()
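The `lod` element yielded above (`label_offs`) is a Fluid level-of-detail offset vector: entry `i` is the cumulative box count over the first `i` images, so a batch of three images with 2, 5, and 1 faces yields `label_offs == [0, 2, 7, 8]`. Passing it through `t.set_lod([lod])` in the `tensor` helpers used by the training and profiling scripts tells the executor how the flattened `face_box`/`head_box`/`gt_label` tensors map back to individual images.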
-def train(settings, file_list, shuffle=True):
-    return pyramidbox(settings, file_list, 'train', shuffle)
-
-
 def test(settings, file_list):
-    return pyramidbox(settings, file_list, 'test', False)
+    file_dict = load_file_list(file_list)
+
+    def reader():
+        for index_image in file_dict.keys():
+            image_name = file_dict[index_image][0] + '.jpg'
+            image_path = os.path.join(settings.data_dir, image_name)
+            im = Image.open(image_path)
+            if im.mode == 'L':
+                im = im.convert('RGB')
+            yield im, image_path
+
+    return reader


 def infer(settings, image_path):
...
@@ -18,13 +18,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('parallel',         bool,  True,            "parallel")
 add_arg('learning_rate',    float, 0.001,           "Learning rate.")
 add_arg('batch_size',       int,   12,              "Minibatch size.")
-add_arg('num_passes',       int,   120,             "Epoch number.")
+add_arg('num_passes',       int,   160,             "Epoch number.")
 add_arg('use_gpu',          bool,  True,            "Whether use GPU.")
 add_arg('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model.")
 add_arg('model_save_dir',   str,   'output',        "The path to save model.")
 add_arg('pretrained_model', str,   './pretrained/', "The init model path.")
 add_arg('resize_h',         int,   640,             "The resized image height.")
 add_arg('resize_w',         int,   640,             "The resized image width.")
+add_arg('with_mem_opt',     bool,  False,           "Whether to use memory optimization or not.")
 #yapf: enable
@@ -38,6 +39,7 @@ def train(args, config, train_file_list, optimizer_method):
     use_pyramidbox = args.use_pyramidbox
     model_save_dir = args.model_save_dir
     pretrained_model = args.pretrained_model
+    with_memory_optimization = args.with_mem_opt

     num_classes = 2
     image_shape = [3, height, width]
@@ -56,8 +58,9 @@ def train(args, config, train_file_list, optimizer_method):
         loss = network.vgg_ssd_loss()
         fetches = [loss]

-    epocs = 12880 / batch_size
-    boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
+    steps_per_pass = 12880 / batch_size
+    boundaries = [steps_per_pass * 50, steps_per_pass * 80,
+                  steps_per_pass * 120, steps_per_pass * 140]
     values = [
         learning_rate, learning_rate * 0.5, learning_rate * 0.25,
         learning_rate * 0.1, learning_rate * 0.01
@@ -77,7 +80,8 @@ def train(args, config, train_file_list, optimizer_method):
     )
     optimizer.minimize(loss)

-    #fluid.memory_optimize(fluid.default_main_program())
+    if with_memory_optimization:
+        fluid.memory_optimize(fluid.default_main_program())

     place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
@@ -101,9 +105,7 @@ def train(args, config, train_file_list, optimizer_method):
         train_exe = fluid.ParallelExecutor(
             use_cuda=use_gpu, loss_name=loss.name)

-    train_reader = paddle.batch(
-        reader.train(config, train_file_list), batch_size=batch_size)
-    feeder = fluid.DataFeeder(place=place, feed_list=network.feeds())
+    train_reader = reader.train_batch_reader(config, train_file_list,
+                                             batch_size=batch_size)

     def save_model(postfix):
         model_path = os.path.join(model_save_dir, postfix)
@@ -112,20 +114,34 @@ def train(args, config, train_file_list, optimizer_method):
         print 'save models to %s' % (model_path)
         fluid.io.save_persistables(exe, model_path)

+    def tensor(data, place, lod=None):
+        t = fluid.core.LoDTensor()
+        t.set(data, place)
+        if lod:
+            t.set_lod(lod)
+        return t
+
     for pass_id in range(start_pass, num_passes):
         start_time = time.time()
         prev_start_time = start_time
         end_time = 0
-        for batch_id, data in enumerate(train_reader()):
+        for batch_id in range(steps_per_pass):
+            im, face_box, head_box, labels, lod = next(train_reader)
+            im_t = tensor(im, place)
+            box1 = tensor(face_box, place, [lod])
+            box2 = tensor(head_box, place, [lod])
+            lbl_t = tensor(labels, place, [lod])
+            feeding = {'image': im_t, 'face_box': box1,
+                       'head_box': box2, 'gt_label': lbl_t}
             prev_start_time = start_time
             start_time = time.time()
-            if len(data) < 2 * devices_num: continue
             if args.parallel:
                 fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
-                                           feed=feeder.feed(data))
+                                           feed=feeding)
             else:
                 fetch_vars = exe.run(fluid.default_main_program(),
-                                     feed=feeder.feed(data),
+                                     feed=feeding,
                                      fetch_list=fetches)
             end_time = time.time()
             fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
@@ -155,7 +171,8 @@ if __name__ == '__main__':
         data_dir=data_dir,
         resize_h=args.resize_h,
         resize_w=args.resize_w,
+        apply_distort=True,
         apply_expand=False,
-        mean_value=[104., 117., 123],
+        mean_value=[104., 117., 123.],
         ap_version='11point')
     train(args, config, train_file_list, optimizer_method="momentum")
@@ -38,7 +38,7 @@ class InferTaskConfig(object):
     batch_size = 10

     # the parameters for beam search.
     beam_size = 5
-    max_length = 256
+    max_out_len = 256
     # the number of decoded sentences to output.
     n_best = 1
     # the flags indicating whether to output the special tokens.
@@ -104,23 +104,28 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
             break


+# The placeholder for batch_size in compile time. Must be -1 currently to be
+# consistent with some ops' infer-shape output in compile time, such as the
+# sequence_expand op used in beamsearch decoder.
+batch_size = -1
+# The placeholder for sequence length in compile time.
+seq_len = ModelHyperParams.max_length
 # Here list the data shapes and data types of all inputs.
 # The shapes here act as placeholder and are set to pass the infer-shape in
 # compile time.
 input_descs = {
     # The actual data shape of src_word is:
     # [batch_size * max_src_len_in_batch, 1]
-    "src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "src_word": [(batch_size * seq_len, 1L), "int64", 2],
     # The actual data shape of src_pos is:
     # [batch_size * max_src_len_in_batch, 1]
-    "src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "src_pos": [(batch_size * seq_len, 1L), "int64"],
     # This input is used to remove attention weights on paddings in the
     # encoder.
     # The actual data shape of src_slf_attn_bias is:
     # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias":
-    [(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
-      (ModelHyperParams.max_length + 1)), "float32"],
+    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
     # This shape input is used to reshape the output of embedding layer.
     "src_data_shape": [(3L, ), "int32"],
     # This shape input is used to reshape before softmax in self attention.
@@ -129,24 +134,23 @@ input_descs = {
     "src_slf_attn_post_softmax_shape": [(4L, ), "int32"],
     # The actual data shape of trg_word is:
     # [batch_size * max_trg_len_in_batch, 1]
-    "trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "trg_word": [(batch_size * seq_len, 1L), "int64",
+                 2],  # lod_level is only used in fast decoder.
     # The actual data shape of trg_pos is:
     # [batch_size * max_trg_len_in_batch, 1]
-    "trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
+    "trg_pos": [(batch_size * seq_len, 1L), "int64"],
     # This input is used to remove attention weights on paddings and
     # subsequent words in the decoder.
     # The actual data shape of trg_slf_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(1, ModelHyperParams.n_head,
-                           (ModelHyperParams.max_length + 1),
-                           (ModelHyperParams.max_length + 1)), "float32"],
+    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
     # This input is used to remove attention weights on paddings of the source
     # input in the encoder-decoder attention.
     # The actual data shape of trg_src_attn_bias is:
     # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(1, ModelHyperParams.n_head,
-                           (ModelHyperParams.max_length + 1),
-                           (ModelHyperParams.max_length + 1)), "float32"],
+    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
     # This shape input is used to reshape the output of embedding layer.
     "trg_data_shape": [(3L, ), "int32"],
     # This shape input is used to reshape before softmax in self attention.
@@ -162,15 +166,18 @@ input_descs = {
     # This input is used in independent decoder program for inference.
# The actual data shape of enc_output is: # The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model] # [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(1, (ModelHyperParams.max_length + 1), "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is: # The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1] # [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"], "lbl_word": [(batch_size * seq_len, 1L), "int64"],
# This input is used to mask out the loss of padding tokens. # This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is: # The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1] # [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"], "lbl_weight": [(batch_size * seq_len, 1L), "float32"],
# These inputs are used to change the shape tensors in the beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(2L, ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(4L, ), "int32"],
"init_score": [(batch_size, 1L), "float32"],
} }
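Each entry above is later consumed by `make_all_inputs` (see the model.py hunk below) to declare a data layer; a minimal sketch of that mapping, assuming the `(shape, dtype[, lod_level])` layout:

import paddle.fluid as fluid

def declare_input(name, desc):
    # desc follows input_descs: (shape, dtype) or (shape, dtype, lod_level).
    return fluid.layers.data(
        name=name,
        shape=desc[0],
        dtype=desc[1],
        lod_level=desc[2] if len(desc) == 3 else 0,
        append_batch_size=False)

src_word = declare_input("src_word", input_descs["src_word"])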
# Names of word embedding table which might be reused for weight sharing. # Names of word embedding table which might be reused for weight sharing.
...@@ -205,3 +212,12 @@ decoder_util_input_fields = ( ...@@ -205,3 +212,12 @@ decoder_util_input_fields = (
label_data_input_fields = ( label_data_input_fields = (
"lbl_word", "lbl_word",
"lbl_weight", ) "lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"trg_src_attn_bias", )
fast_decoder_util_input_fields = decoder_util_input_fields + (
"trg_slf_attn_pre_softmax_shape_delta",
"trg_slf_attn_post_softmax_shape_delta", )
...@@ -7,6 +7,7 @@ import paddle.fluid as fluid ...@@ -7,6 +7,7 @@ import paddle.fluid as fluid
import model import model
from model import wrap_encoder as encoder from model import wrap_encoder as encoder
from model import wrap_decoder as decoder from model import wrap_decoder as decoder
from model import fast_decode as fast_decoder
from config import * from config import *
from train import pad_batch_data from train import pad_batch_data
import reader import reader
...@@ -87,7 +88,8 @@ def translate_batch(exe, ...@@ -87,7 +88,8 @@ def translate_batch(exe,
output_unk=True): output_unk=True):
""" """
Run the encoder program once and run the decoder program multiple times to Run the encoder program once and run the decoder program multiple times to
implement beam search externally. implement beam search externally. This is deprecated since a faster beam
search decoder based solely on Fluid operators has been added.
""" """
# Prepare data for encoder and run the encoder. # Prepare data for encoder and run the encoder.
enc_in_data = pad_batch_data( enc_in_data = pad_batch_data(
...@@ -297,7 +299,32 @@ def translate_batch(exe, ...@@ -297,7 +299,32 @@ def translate_batch(exe,
return seqs, scores[:, :n_best].tolist() return seqs, scores[:, :n_best].tolist()
def infer(args): def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate at the first
<eos> and, by default, remove the <bos> and <eos> tokens.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
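A quick worked example of this post-processing (the token ids and the 0/1 indices for <bos>/<eos> are made up for illustration):

seq = [0, 5, 7, 1, 9, 1]
out = list(post_process_seq(seq, bos_idx=0, eos_idx=1,
                            output_bos=False, output_eos=False))
print(out)  # [5, 7]: truncated at the first <eos>, with <bos>/<eos> stripped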
def py_infer(test_data, trg_idx2word):
"""
Inference by beam search implemented in Python, while the calculations from
symbols to probabilities are executed by Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -341,49 +368,8 @@ def infer(args): ...@@ -341,49 +368,8 @@ def infer(args):
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params) fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to set dropout to the test mode. # This is used here to set dropout to the test mode.
encoder_program = fluid.io.get_inference_program( encoder_program = encoder_program.inference_optimize()
target_vars=[enc_output], main_program=encoder_program) decoder_program = decoder_program.inference_optimize()
decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
for batch_id, data in enumerate(test_data.batch_generator()): for batch_id, data in enumerate(test_data.batch_generator()):
batch_seqs, batch_scores = translate_batch( batch_seqs, batch_scores = translate_batch(
...@@ -397,7 +383,7 @@ def infer(args): ...@@ -397,7 +383,7 @@ def infer(args):
(decoder_data_input_fields[-1], ), (decoder_data_input_fields[-1], ),
[predict.name], [predict.name],
InferTaskConfig.beam_size, InferTaskConfig.beam_size,
InferTaskConfig.max_length, InferTaskConfig.max_out_len,
InferTaskConfig.n_best, InferTaskConfig.n_best,
len(data), len(data),
ModelHyperParams.n_head, ModelHyperParams.n_head,
...@@ -416,6 +402,154 @@ def infer(args): ...@@ -416,6 +402,154 @@ def infer(args):
print(" ".join([trg_idx2word[idx] for idx in seq])) print(" ".join([trg_idx2word[idx] for idx in seq]))
def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
bos_idx, n_head, d_model, place):
"""
Put all the padded data needed by the beam-search decoder into a dict.
"""
src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
# start tokens
trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64")
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, 1, 1]).astype("float32")
# These shape tensors are used in reshape_op.
src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([-1, 1, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array(
[-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
trg_slf_attn_pre_softmax_shape = np.array(
[-1, 1], dtype="int32") # only the first time step
trg_slf_attn_post_softmax_shape = np.array(
[-1, n_head, 1, 1], dtype="int32") # only the first time step
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
[-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
# These inputs are used to change the shapes inside the loop of the while op.
attn_pre_softmax_shape_delta = np.array([0, 1], dtype="int32")
attn_post_softmax_shape_delta = np.array([0, 0, 0, 1], dtype="int32")
def to_lodtensor(data, place, lod=None):
data_tensor = fluid.LoDTensor()
data_tensor.set(data, place)
if lod is not None:
data_tensor.set_lod(lod)
return data_tensor
# The beam_search op requires tensors with LoD.
init_score = to_lodtensor(
np.zeros_like(
trg_word, dtype="float32"),
place, [range(trg_word.shape[0] + 1)] * 2)
trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2)
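# Note (illustration, not part of the commit): for a batch of n instances,
# [range(n + 1)] * 2 builds the 2-level LoD [[0, 1, ..., n], [0, 1, ..., n]],
# e.g. n = 3 -> [[0, 1, 2, 3], [0, 1, 2, 3]]. Level 0 says each source
# sentence owns exactly one candidate so far; level 1 says each candidate is
# a single start token. The beam_search op grows these branches later.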
data_input_dict = dict(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
trg_src_attn_bias
]))
util_input_dict = dict(
zip(util_input_names, [
src_data_shape, src_slf_attn_pre_softmax_shape,
src_slf_attn_post_softmax_shape, trg_data_shape,
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta
]))
input_dict = dict(data_input_dict.items() + util_input_dict.items())
return input_dict
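One caveat on the final line: `dict(a.items() + b.items())` relies on Python 2, where `items()` returns concatenable lists. An equivalent that also runs on Python 3:

input_dict = dict(data_input_dict)
input_dict.update(util_input_dict)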
def fast_infer(test_data, trg_idx2word):
"""
Inference by a beam-search decoder based solely on Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
out_ids, out_scores = fast_decoder(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, InferTaskConfig.beam_size,
InferTaskConfig.max_out_len, ModelHyperParams.eos_idx)
fluid.io.load_vars(
exe,
InferTaskConfig.model_path,
vars=filter(lambda var: isinstance(var, fluid.framework.Parameter),
fluid.default_main_program().list_vars()))
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
data_input = prepare_batch_input(
data, encoder_data_input_fields + fast_decoder_data_input_fields,
encoder_util_input_fields + fast_decoder_util_input_fields,
ModelHyperParams.eos_idx, ModelHyperParams.bos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model, place)
seq_ids, seq_scores = exe.run(infer_program,
feed=data_input,
fetch_list=[out_ids, out_scores],
return_numpy=False)
# How to parse the results:
# Suppose the lod of seq_ids is:
# [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
# then from lod[0]:
# there are 2 source sentences, beam width is 3.
# from lod[1]:
# the first source sentence has 3 hyps; the lengths are 12, 12, 16
# the second source sentence has 3 hyps; the lengths are 14, 13, 15
hyps = [[] for i in range(len(data))]
scores = [[] for i in range(len(data))]
for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
for j in range(end - start): # for each candidate
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in post_process_seq(
np.array(seq_ids)[sub_start:sub_end])
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print hyps[i][-1]
if len(hyps[i]) >= InferTaskConfig.n_best:
break
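The LoD bookkeeping above is easier to follow on the example from the comment; a standalone sketch using that LoD:

lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]  # example LoD from the comment
for i in range(len(lod[0]) - 1):  # each source sentence
    start, end = lod[0][i], lod[0][i + 1]
    lengths = [lod[1][start + j + 1] - lod[1][start + j]
               for j in range(end - start)]
    print("sentence %d: %d hyps with lengths %s" % (i, end - start, lengths))
# sentence 0: 3 hyps with lengths [12, 12, 16]
# sentence 1: 3 hyps with lengths [14, 13, 15]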
def infer(args, inferencer=fast_infer):
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
inferencer(test_data, trg_idx2word)
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
infer(args) infer(args)
...@@ -30,7 +30,8 @@ def multi_head_attention(queries, ...@@ -30,7 +30,8 @@ def multi_head_attention(queries,
n_head=1, n_head=1,
dropout_rate=0., dropout_rate=0.,
pre_softmax_shape=None, pre_softmax_shape=None,
post_softmax_shape=None): post_softmax_shape=None,
cache=None):
""" """
Multi-Head Attention. Note that attn_bias is added to the logit before Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activation to mask certain selected positions so that computing softmax activation to mask certain selected positions so that
...@@ -116,6 +117,10 @@ def multi_head_attention(queries, ...@@ -116,6 +117,10 @@ def multi_head_attention(queries,
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
k = cache["k"] = layers.concat([cache["k"], k], axis=1)
v = cache["v"] = layers.concat([cache["v"], v], axis=1)
q = __split_heads(q, n_head) q = __split_heads(q, n_head)
k = __split_heads(k, n_head) k = __split_heads(k, n_head)
v = __split_heads(v, n_head) v = __split_heads(v, n_head)
...@@ -203,7 +208,7 @@ def prepare_encoder(src_word, ...@@ -203,7 +208,7 @@ def prepare_encoder(src_word,
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
enc_input = layers.reshape( enc_input = layers.reshape(
x=enc_input, x=enc_input,
shape=[-1, src_max_len, src_emb_dim], shape=[batch_size, seq_len, src_emb_dim],
actual_shape=src_data_shape) actual_shape=src_data_shape)
return layers.dropout( return layers.dropout(
enc_input, dropout_prob=dropout_rate, enc_input, dropout_prob=dropout_rate,
...@@ -285,7 +290,8 @@ def decoder_layer(dec_input, ...@@ -285,7 +290,8 @@ def decoder_layer(dec_input,
slf_attn_pre_softmax_shape=None, slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None, slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None, src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None): src_attn_post_softmax_shape=None,
cache=None):
""" The layer to be stacked in decoder part. """ The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention. a multi-head attention is added to implement encoder-decoder attention.
...@@ -301,7 +307,8 @@ def decoder_layer(dec_input, ...@@ -301,7 +307,8 @@ def decoder_layer(dec_input,
n_head, n_head,
dropout_rate, dropout_rate,
slf_attn_pre_softmax_shape, slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, ) slf_attn_post_softmax_shape,
cache, )
slf_attn_output = post_process_layer( slf_attn_output = post_process_layer(
dec_input, dec_input,
slf_attn_output, slf_attn_output,
...@@ -350,7 +357,8 @@ def decoder(dec_input, ...@@ -350,7 +357,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape=None, slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None, slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None, src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None): src_attn_post_softmax_shape=None,
caches=None):
""" """
The decoder is composed of a stack of identical decoder_layer layers. The decoder is composed of a stack of identical decoder_layer layers.
""" """
...@@ -369,7 +377,8 @@ def decoder(dec_input, ...@@ -369,7 +377,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape, slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape, src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, ) src_attn_post_softmax_shape,
None if caches is None else caches[i], )
dec_input = dec_output dec_input = dec_output
return dec_output return dec_output
...@@ -384,6 +393,8 @@ def make_all_inputs(input_fields): ...@@ -384,6 +393,8 @@ def make_all_inputs(input_fields):
name=input_field, name=input_field,
shape=input_descs[input_field][0], shape=input_descs[input_field][0],
dtype=input_descs[input_field][1], dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False) append_batch_size=False)
inputs.append(input_var) inputs.append(input_var)
return inputs return inputs
...@@ -517,7 +528,8 @@ def wrap_decoder(trg_vocab_size, ...@@ -517,7 +528,8 @@ def wrap_decoder(trg_vocab_size,
dropout_rate, dropout_rate,
weight_sharing, weight_sharing,
dec_inputs=None, dec_inputs=None,
enc_output=None): enc_output=None,
caches=None):
""" """
The wrapper assembles together all needed layers for the decoder. The wrapper assembles together all needed layers for the decoder.
""" """
...@@ -559,7 +571,8 @@ def wrap_decoder(trg_vocab_size, ...@@ -559,7 +571,8 @@ def wrap_decoder(trg_vocab_size,
slf_attn_pre_softmax_shape, slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape, src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, ) src_attn_post_softmax_shape,
caches, )
# Return logits for training and probs for inference. # Return logits for training and probs for inference.
if weight_sharing: if weight_sharing:
predict = layers.reshape( predict = layers.reshape(
...@@ -578,3 +591,145 @@ def wrap_decoder(trg_vocab_size, ...@@ -578,3 +591,145 @@ def wrap_decoder(trg_vocab_size,
shape=[-1, trg_vocab_size], shape=[-1, trg_vocab_size],
act="softmax" if dec_inputs is None else None) act="softmax" if dec_inputs is None else None)
return predict return predict
def fast_decode(
src_vocab_size,
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
beam_size,
max_out_len,
eos_idx, ):
"""
Use beam search to decode. Caches store the states of history steps,
which makes decoding faster.
"""
enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
dropout_rate, weight_sharing)
start_tokens, init_scores, trg_src_attn_bias, trg_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \
src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta = \
make_all_inputs(fast_decoder_data_input_fields +
fast_decoder_util_input_fields)
def beam_search():
max_len = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=max_out_len)
step_idx = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=0)
cond = layers.less_than(x=step_idx, y=max_len)
while_op = layers.While(cond)
# array states will be stored for each step.
ids = layers.array_write(start_tokens, step_idx)
scores = layers.array_write(init_scores, step_idx)
# cell states will be overwritten at each step.
# caches contain the states of history steps to reduce redundant
# computation in the decoder.
caches = [{
"k": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0)
} for i in range(n_layer)]
with while_op.block():
pre_ids = layers.array_read(array=ids, i=step_idx)
pre_scores = layers.array_read(array=scores, i=step_idx)
# sequence_expand can gather sequences according to LoD and thus can be
# used in beam search to select the states corresponding to selected ids.
pre_src_attn_bias = layers.sequence_expand(
x=trg_src_attn_bias, y=pre_scores)
pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
pre_caches = [{
"k": layers.sequence_expand(
x=cache["k"], y=pre_scores),
"v": layers.sequence_expand(
x=cache["v"], y=pre_scores),
} for cache in caches]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
input=pre_enc_output, # can't use pre_ids here since it has lod
value=1,
shape=[-1, 1],
dtype=pre_ids.dtype),
y=layers.increment(
x=step_idx, value=1.0, in_place=False),
axis=0)
logits = wrap_decoder(
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs=(
pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
enc_output=pre_enc_output,
caches=pre_caches)
topk_scores, topk_indices = layers.topk(
input=layers.softmax(logits), k=beam_size)
accu_scores = layers.elementwise_add(
x=layers.log(topk_scores),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
# beam_search op uses lod to distinguish branches.
topk_indices = layers.lod_reset(topk_indices, pre_ids)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=eos_idx)
layers.increment(x=step_idx, value=1.0, in_place=True)
# update states
layers.array_write(selected_ids, i=step_idx, array=ids)
layers.array_write(selected_scores, i=step_idx, array=scores)
layers.assign(pre_src_attn_bias, trg_src_attn_bias)
layers.assign(pre_enc_output, enc_output)
for i in range(n_layer):
layers.assign(pre_caches[i]["k"], caches[i]["k"])
layers.assign(pre_caches[i]["v"], caches[i]["v"])
layers.assign(
layers.elementwise_add(
x=slf_attn_pre_softmax_shape,
y=attn_pre_softmax_shape_delta),
slf_attn_pre_softmax_shape)
layers.assign(
layers.elementwise_add(
x=slf_attn_post_softmax_shape,
y=attn_post_softmax_shape_delta),
slf_attn_post_softmax_shape)
length_cond = layers.less_than(x=step_idx, y=max_len)
finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=beam_size, end_id=eos_idx)
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()
return finished_ids, finished_scores
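A minimal usage sketch of fast_decode; all hyper-parameter values below are placeholders for illustration, not values prescribed by this commit:

finished_ids, finished_scores = fast_decode(
    src_vocab_size=10000, trg_vocab_size=10000, max_in_len=257,
    n_layer=6, n_head=8, d_key=64, d_value=64, d_model=512,
    d_inner_hid=2048, dropout_rate=0.1, weight_sharing=True,
    beam_size=5, max_out_len=256, eos_idx=1)
# The returned LoD tensors use level 0 to separate source sentences and
# level 1 to separate the beam hypotheses of each, as parsed in fast_infer.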
...@@ -198,7 +198,8 @@ class DataReader(object): ...@@ -198,7 +198,8 @@ class DataReader(object):
for line in f_obj: for line in f_obj:
fields = line.strip().split(self._delimiter) fields = line.strip().split(self._delimiter)
if len(fields) != 2 or (self._only_src and len(fields) != 1): if (not self._only_src and len(fields) != 2) or (self._only_src and
len(fields) != 1):
continue continue
sample_words = [] sample_words = []
...@@ -275,7 +276,7 @@ class DataReader(object): ...@@ -275,7 +276,7 @@ class DataReader(object):
for sample_idx in self._sample_idxs: for sample_idx in self._sample_idxs:
if self._only_src: if self._only_src:
yield (self._src_seq_ids[sample_idx]) yield (self._src_seq_ids[sample_idx], )
else: else:
yield (self._src_seq_ids[sample_idx], yield (self._src_seq_ids[sample_idx],
self._trg_seq_ids[sample_idx][:-1], self._trg_seq_ids[sample_idx][:-1],
......
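Both fixes in this reader hunk are subtle. The filter now rejects a line only when its field count mismatches the active mode, and the yield gains a trailing comma because in Python `(x)` is just `x` with parentheses while `(x, )` is a one-element tuple:

ids = [3, 8, 2]
print((ids) is ids)  # True: parentheses alone do not build a tuple
print((ids, ))       # ([3, 8, 2],): the trailing comma does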