diff --git a/fluid/DeepASR/README_cn.md b/fluid/DeepASR/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..be78a048701a621bd90942bdfe30ef4d7c7f082f
--- /dev/null
+++ b/fluid/DeepASR/README_cn.md
@@ -0,0 +1,186 @@
+Running the examples in this directory requires PaddlePaddle v0.14 or above. If your installed version of PaddlePaddle is older than this, please update it following the instructions in the [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html).
+
+---
+
+DeepASR (Deep Automatic Speech Recognition) is a speech recognition system built on PaddlePaddle Fluid and [Kaldi](http://www.kaldi-asr.org). It uses the Fluid framework to configure and train the acoustic model of a speech recognition pipeline and integrates Kaldi's decoder. The goal is to let users who are already familiar with Kaldi train acoustic models quickly and at scale, while relying on Kaldi for the complex preprocessing of speech data and the final decoding step.
+
+### Table of Contents
+- [Model Overview](#model-overview)
+- [Installation](#installation)
+- [Data Preprocessing](#data-reprocessing)
+- [Training](#training)
+- [Time Profiling During Training](#perf-profiling)
+- [Inference and Decoding](#infer-decoding)
+- [Scoring the Error Rate](#scoring-error-rate)
+- [The Aishell Example](#aishell-example)
+- [Contributing More Examples](#how-to-contrib)
+
+### Model Overview
+
+The acoustic model of DeepASR is a single convolutional layer followed by a stack of LSTMP layers: the convolution performs preliminary feature extraction, the stacked LSTMP layers model the temporal dependencies, and the loss function is cross entropy. [LSTMP](https://arxiv.org/abs/1402.1128) (LSTM with recurrent projection layer) extends the conventional LSTM by adding a projection layer that maps the hidden state to a lower dimension before feeding it into the next time step. This greatly reduces the parameter count and computational cost of the LSTM while also improving its performance.
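+A minimal sketch of the LSTMP recurrence (our notation, following the cited paper): with cell state c_t and output gate o_t, the cell output m_t is projected down to r_t, and it is r_t that feeds the recurrent connections at the next time step:
+
+```text
+m_t = o_t * tanh(c_t)   # cell output, dimension n
+r_t = W_rm * m_t        # recurrent projection, dimension p < n
+```
+
+With p < n, each recurrent weight matrix shrinks from n x n to n x p, at the cost of a single shared p x n projection matrix.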
+
+*Figure 1. The topology of LSTMP*
+
+*Figure 2. Learning curve of the acoustic model trained on the Aishell dataset*
+
+
+*The PyramidBox face detection model*
+
+*A demonstration of PyramidBox face detection*
+
+*WIDER FACE Easy/Medium/Hard set*
+
+> At present, the model parameters of the PaddlePaddle implementation are still being tuned; results better than those shown above will be released later.
diff --git a/fluid/face_detection/data/download.sh b/fluid/face_detection/data/download.sh
new file mode 100755
index 0000000000000000000000000000000000000000..aa32b53dd44f4286b4a6e24fba75f098d797487f
--- /dev/null
+++ b/fluid/face_detection/data/download.sh
@@ -0,0 +1,8 @@
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+echo "Downloading..."
+wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/bbx_annotation/wider_face_split.zip
+
+echo "Extracting..."
+unzip wider_face_split.zip && rm -f wider_face_split.zip
diff --git a/fluid/face_detection/data_util.py b/fluid/face_detection/data_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac022593119e0008c3f7f3858303cbf5bc717650
--- /dev/null
+++ b/fluid/face_detection/data_util.py
@@ -0,0 +1,151 @@
+"""
+This code is based on https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py
+"""
+
+import time
+import numpy as np
+import threading
+import multiprocessing
+try:
+ import queue
+except ImportError:
+ import Queue as queue
+
+
+class GeneratorEnqueuer(object):
+ """
+ Builds a queue out of a data generator.
+
+ Args:
+ generator: a generator function which endlessly yields data
+ use_multiprocessing (bool): use multiprocessing if True,
+ otherwise use threading.
+ wait_time (float): time to sleep in-between calls to `put()`.
+        random_seed (int): initial seed for the workers;
+            it is incremented by one for each worker.
+ """
+
+ def __init__(self,
+ generator,
+ use_multiprocessing=False,
+ wait_time=0.05,
+ random_seed=None):
+ self.wait_time = wait_time
+ self._generator = generator
+ self._use_multiprocessing = use_multiprocessing
+ self._threads = []
+ self._stop_event = None
+ self.queue = None
+ self._manager = None
+ self.seed = random_seed
+
+ def start(self, workers=1, max_queue_size=10):
+ """
+        Start worker threads/processes which add data from the generator
+        into the queue.
+
+        Args:
+            workers (int): number of workers
+ max_queue_size (int): queue size
+ (when full, threads could block on `put()`)
+ """
+
+ def data_generator_task():
+ """
+ Data generator task.
+ """
+
+ def task():
+ if (self.queue is not None and
+ self.queue.qsize() < max_queue_size):
+ generator_output = next(self._generator)
+ self.queue.put((generator_output))
+ else:
+ time.sleep(self.wait_time)
+
+ if not self._use_multiprocessing:
+ while not self._stop_event.is_set():
+ with self.genlock:
+ try:
+ task()
+ except Exception:
+ self._stop_event.set()
+ break
+ else:
+ while not self._stop_event.is_set():
+ try:
+ task()
+ except Exception:
+ self._stop_event.set()
+ break
+
+ try:
+ if self._use_multiprocessing:
+ self._manager = multiprocessing.Manager()
+ self.queue = self._manager.Queue(maxsize=max_queue_size)
+ self._stop_event = multiprocessing.Event()
+ else:
+ self.genlock = threading.Lock()
+ self.queue = queue.Queue()
+ self._stop_event = threading.Event()
+ for _ in range(workers):
+ if self._use_multiprocessing:
+ # Reset random seed else all children processes
+ # share the same seed
+ np.random.seed(self.seed)
+ thread = multiprocessing.Process(target=data_generator_task)
+ thread.daemon = True
+ if self.seed is not None:
+ self.seed += 1
+ else:
+ thread = threading.Thread(target=data_generator_task)
+ self._threads.append(thread)
+ thread.start()
+ except:
+ self.stop()
+ raise
+
+ def is_running(self):
+ """
+ Returns:
+            bool: Whether the worker threads are running.
+ """
+ return self._stop_event is not None and not self._stop_event.is_set()
+
+ def stop(self, timeout=None):
+ """
+        Stops the running threads and waits for them to exit, if necessary.
+ Should be called by the same thread which called `start()`.
+
+ Args:
+ timeout(int|None): maximum time to wait on `thread.join()`.
+ """
+ if self.is_running():
+ self._stop_event.set()
+ for thread in self._threads:
+ if self._use_multiprocessing:
+ if thread.is_alive():
+ thread.terminate()
+ else:
+ thread.join(timeout)
+ if self._manager:
+ self._manager.shutdown()
+
+ self._threads = []
+ self._stop_event = None
+ self.queue = None
+
+ def get(self):
+ """
+ Creates a generator to extract data from the queue.
+        Skips the data if it is `None`.
+
+        Yields:
+            tuple of data from the queue.
+ """
+ while self.is_running():
+ if not self.queue.empty():
+ inputs = self.queue.get()
+ if inputs is not None:
+ yield inputs
+ else:
+ time.sleep(self.wait_time)
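+
+
+# A minimal usage sketch (`batch_generator` is a hypothetical endless
+# generator of training batches):
+#
+#     enqueuer = GeneratorEnqueuer(batch_generator, use_multiprocessing=False)
+#     enqueuer.start(workers=4, max_queue_size=16)
+#     for batch in enqueuer.get():
+#         ...          # consume one prefetched batch per iteration
+#     enqueuer.stop()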
diff --git a/fluid/face_detection/image_util.py b/fluid/face_detection/image_util.py
index f39538285637c1a284c4058130be40d89435dcef..8f3728a90402f07665c2678a2eae3e86bb128068 100644
--- a/fluid/face_detection/image_util.py
+++ b/fluid/face_detection/image_util.py
@@ -131,12 +131,13 @@ def data_anchor_sampling(sampler, bbox_labels, image_width, image_height,
rand_idx_size = range_size + 1
else:
# np.random.randint range: [low, high)
- rng_rand_size = np.random.randint(0, range_size)
- rand_idx_size = rng_rand_size % range_size
-
- scale_choose = random.uniform(scale_array[rand_idx_size] / 2.0,
- 2.0 * scale_array[rand_idx_size])
+ rng_rand_size = np.random.randint(0, range_size + 1)
+ rand_idx_size = rng_rand_size % (range_size + 1)
+ min_resize_val = scale_array[rand_idx_size] / 2.0
+ max_resize_val = min(2.0 * scale_array[rand_idx_size],
+ 2 * math.sqrt(wid * hei))
+ scale_choose = random.uniform(min_resize_val, max_resize_val)
sample_bbox_size = wid * resize_width / scale_choose
w_off_orig = 0.0
@@ -389,9 +390,19 @@ def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
roi_width = cross_width
roi_height = cross_height
+ roi_y1 = int(roi_ymin)
+ roi_y2 = int(roi_ymin + roi_height)
+ roi_x1 = int(roi_xmin)
+ roi_x2 = int(roi_xmin + roi_width)
+
+ cross_y1 = int(cross_ymin)
+ cross_y2 = int(cross_ymin + cross_height)
+ cross_x1 = int(cross_xmin)
+ cross_x2 = int(cross_xmin + cross_width)
+
sample_img = np.zeros((height, width, 3))
- sample_img[int(roi_ymin) : int(roi_ymin + roi_height), int(roi_xmin) : int(roi_xmin + roi_width)] = \
- img[int(cross_ymin) : int(cross_ymin + cross_height), int(cross_xmin) : int(cross_xmin + cross_width)]
+ sample_img[roi_y1 : roi_y2, roi_x1 : roi_x2] = \
+ img[cross_y1 : cross_y2, cross_x1 : cross_x2]
sample_img = cv2.resize(
sample_img, (resize_width, resize_height), interpolation=cv2.INTER_AREA)
diff --git a/fluid/face_detection/images/architecture_of_pyramidbox.jpg b/fluid/face_detection/images/architecture_of_pyramidbox.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d453ce3d80478c1606d784056cea5e9d599f5120
Binary files /dev/null and b/fluid/face_detection/images/architecture_of_pyramidbox.jpg differ
diff --git a/fluid/face_detection/images/demo_img.jpg b/fluid/face_detection/images/demo_img.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4d950e723a01aa32e2b848333ef903bcb8779d8f
Binary files /dev/null and b/fluid/face_detection/images/demo_img.jpg differ
diff --git a/fluid/face_detection/images/wider_pr_cruve_int_easy_val.jpg b/fluid/face_detection/images/wider_pr_cruve_int_easy_val.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..29f902491ea35a527fb3d5822e5bc3a7c4d976cb
Binary files /dev/null and b/fluid/face_detection/images/wider_pr_cruve_int_easy_val.jpg differ
diff --git a/fluid/face_detection/images/wider_pr_cruve_int_hard_val.jpg b/fluid/face_detection/images/wider_pr_cruve_int_hard_val.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..58f941be640c130bb7cfdf013f6e61d1ca948dba
Binary files /dev/null and b/fluid/face_detection/images/wider_pr_cruve_int_hard_val.jpg differ
diff --git a/fluid/face_detection/images/wider_pr_cruve_int_medium_val.jpg b/fluid/face_detection/images/wider_pr_cruve_int_medium_val.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3c21b78e059e8185b8ad458ebf1c0e88aa3d993e
Binary files /dev/null and b/fluid/face_detection/images/wider_pr_cruve_int_medium_val.jpg differ
diff --git a/fluid/face_detection/profile.py b/fluid/face_detection/profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd686ad0784abd730d41263e3982560345ca6908
--- /dev/null
+++ b/fluid/face_detection/profile.py
@@ -0,0 +1,190 @@
+import os
+import shutil
+import numpy as np
+import time
+import argparse
+import functools
+
+import reader
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from pyramidbox import PyramidBox
+from utility import add_arguments, print_arguments
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+
+# yapf: disable
+add_arg('parallel', bool, True, "parallel")
+add_arg('learning_rate', float, 0.001, "Learning rate.")
+add_arg('batch_size', int, 20, "Minibatch size.")
+add_arg('num_iteration', int, 10, "The number of iterations to profile.")
+add_arg('skip_reader', bool, False, "Whether to skip data reader.")
+add_arg('use_gpu', bool, True, "Whether use GPU.")
+add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
+add_arg('model_save_dir', str, 'output', "The path to save model.")
+add_arg('pretrained_model', str, './pretrained/', "The init model path.")
+add_arg('resize_h', int, 640, "The resized image height.")
+add_arg('resize_w', int, 640, "The resized image width.")
+#yapf: enable
+
+
+def train(args, config, train_file_list, optimizer_method):
+ learning_rate = args.learning_rate
+ batch_size = args.batch_size
+ height = args.resize_h
+ width = args.resize_w
+ use_gpu = args.use_gpu
+ use_pyramidbox = args.use_pyramidbox
+ model_save_dir = args.model_save_dir
+ pretrained_model = args.pretrained_model
+ skip_reader = args.skip_reader
+ num_iterations = args.num_iteration
+ parallel = args.parallel
+
+ num_classes = 2
+ image_shape = [3, height, width]
+
+ devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
+ devices_num = len(devices.split(","))
+
+ fetches = []
+ network = PyramidBox(image_shape, num_classes,
+ sub_network=use_pyramidbox)
+ if use_pyramidbox:
+ face_loss, head_loss, loss = network.train()
+ fetches = [face_loss, head_loss]
+ else:
+ loss = network.vgg_ssd_loss()
+ fetches = [loss]
+
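+    # 12880 is the number of images in the WIDER FACE training split.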
+    steps_per_pass = 12880 / batch_size
+    boundaries = [steps_per_pass * 40, steps_per_pass * 60,
+                  steps_per_pass * 80, steps_per_pass * 100]
+ values = [
+ learning_rate, learning_rate * 0.5, learning_rate * 0.25,
+ learning_rate * 0.1, learning_rate * 0.01
+ ]
+
+ if optimizer_method == "momentum":
+ optimizer = fluid.optimizer.Momentum(
+ learning_rate=fluid.layers.piecewise_decay(
+ boundaries=boundaries, values=values),
+ momentum=0.9,
+ regularization=fluid.regularizer.L2Decay(0.0005),
+ )
+ else:
+ optimizer = fluid.optimizer.RMSProp(
+ learning_rate=fluid.layers.piecewise_decay(boundaries, values),
+ regularization=fluid.regularizer.L2Decay(0.0005),
+ )
+
+ optimizer.minimize(loss)
+ fluid.memory_optimize(fluid.default_main_program())
+
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+
+ start_pass = 0
+ if pretrained_model:
+ if pretrained_model.isdigit():
+ start_pass = int(pretrained_model) + 1
+ pretrained_model = os.path.join(model_save_dir, pretrained_model)
+ print("Resume from %s " %(pretrained_model))
+
+ if not os.path.exists(pretrained_model):
+ raise ValueError("The pre-trained model path [%s] does not exist." %
+ (pretrained_model))
+ def if_exist(var):
+ return os.path.exists(os.path.join(pretrained_model, var.name))
+ fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+ if parallel:
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=use_gpu, loss_name=loss.name)
+
+ train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
+
+ def tensor(data, place, lod=None):
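+        # Wrap a numpy array into a fluid LoDTensor. `lod` is a list of
+        # offset lists, e.g. [[0, 3, 5]] marks sample 0 as owning rows 0-2
+        # and sample 1 as owning rows 3-4 (variable boxes per image).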
+ t = fluid.core.LoDTensor()
+ t.set(data, place)
+ if lod:
+ t.set_lod(lod)
+ return t
+
+ im, face_box, head_box, labels, lod = next(train_reader)
+ im_t = tensor(im, place)
+ box1 = tensor(face_box, place, [lod])
+ box2 = tensor(head_box, place, [lod])
+ lbl_t = tensor(labels, place, [lod])
+ feed_data = {'image': im_t, 'face_box': box1,
+ 'head_box': box2, 'gt_label': lbl_t}
+
+ def run(iterations, feed_data):
+ reader_time = []
+ run_time = []
+ for batch_id in range(iterations):
+ start_time = time.time()
+ if not skip_reader:
+ im, face_box, head_box, labels, lod = next(train_reader)
+ im_t = tensor(im, place)
+ box1 = tensor(face_box, place, [lod])
+ box2 = tensor(head_box, place, [lod])
+ lbl_t = tensor(labels, place, [lod])
+ feed_data = {'image': im_t, 'face_box': box1,
+ 'head_box': box2, 'gt_label': lbl_t}
+ end_time = time.time()
+ reader_time.append(end_time - start_time)
+
+ start_time = time.time()
+ if parallel:
+ fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
+ feed=feed_data)
+ else:
+ fetch_vars = exe.run(fluid.default_main_program(),
+ feed=feed_data,
+ fetch_list=fetches)
+ end_time = time.time()
+ run_time.append(end_time - start_time)
+ fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
+ if not args.use_pyramidbox:
+ print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
+ else:
+ print("Batch {0}, face loss {1}, head loss {2}".format(
+ batch_id, fetch_vars[0], fetch_vars[1]))
+
+ return reader_time, run_time
+
+    # warm-up runs before profiling
+ run(2, feed_data)
+
+ # profiling
+ start = time.time()
+ if not parallel:
+ with profiler.profiler('All', 'total', '/tmp/profile_file'):
+ reader_time, run_time = run(num_iterations, feed_data)
+ else:
+ reader_time, run_time = run(num_iterations, feed_data)
+ end = time.time()
+ total_time = end - start
+ print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
+ total_time, np.sum(reader_time), np.sum(run_time)))
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ print_arguments(args)
+
+    data_dir = 'data/WIDER_train/images/'
+    train_file_list = 'data/wider_face_split/wider_face_train_bbx_gt.txt'
+
+ config = reader.Settings(
+ data_dir=data_dir,
+ resize_h=args.resize_h,
+ resize_w=args.resize_w,
+ apply_expand=False,
+ mean_value=[104., 117., 123.],
+ ap_version='11point')
+ train(args, config, train_file_list, optimizer_method="momentum")
diff --git a/fluid/face_detection/pyramidbox.py b/fluid/face_detection/pyramidbox.py
index 74641f62eff18772337849f521269fecf9cef912..ba1a99356003f3482fcaf87874bb0cabd5733762 100644
--- a/fluid/face_detection/pyramidbox.py
+++ b/fluid/face_detection/pyramidbox.py
@@ -52,7 +52,7 @@ def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True):
class PyramidBox(object):
def __init__(self,
data_shape,
- num_classes,
+ num_classes=None,
use_transposed_conv2d=True,
is_infer=False,
sub_network=False):
@@ -81,10 +81,7 @@ class PyramidBox(object):
if self.is_infer:
return [self.image]
else:
- return [
- self.image, self.face_box, self.head_box, self.gt_label,
- self.difficult
- ]
+ return [self.image, self.face_box, self.head_box, self.gt_label]
def _input(self):
self.image = fluid.layers.data(
@@ -96,8 +93,6 @@ class PyramidBox(object):
name='head_box', shape=[4], dtype='float32', lod_level=1)
self.gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
- self.difficult = fluid.layers.data(
- name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
def _vgg(self):
self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2)
@@ -144,7 +139,8 @@ class PyramidBox(object):
stride=2,
groups=ch,
param_attr=w_attr,
- bias_attr=False)
+ bias_attr=False,
+ use_cudnn=True)
else:
upsampling = fluid.layers.resize_bilinear(
conv1, out_shape=up_to.shape[2:])
@@ -418,5 +414,5 @@ class PyramidBox(object):
nms_threshold=0.3,
nms_top_k=5000,
keep_top_k=750,
- score_threshold=0.05)
+ score_threshold=0.01)
return test_program, face_nmsed_out
diff --git a/fluid/face_detection/reader.py b/fluid/face_detection/reader.py
index 5db54a010a266823c7f00ca1be654f70b9980244..5ac6e506f4cf2d45e3b5ee688492787a99f9264c 100644
--- a/fluid/face_detection/reader.py
+++ b/fluid/face_detection/reader.py
@@ -24,6 +24,7 @@ import time
import copy
import random
import cv2
+from data_util import GeneratorEnqueuer
class Settings(object):
@@ -58,30 +59,25 @@ class Settings(object):
self.saturation_delta = 0.5
self.brightness_prob = 0.5
# _brightness_delta is the normalized value by 256
- # self._brightness_delta = 32
self.brightness_delta = 0.125
self.scale = 0.007843 # 1 / 127.5
self.data_anchor_sampling_prob = 0.5
self.min_face_size = 8.0
-def draw_image(faces_pred, img, resize_val):
- for i in range(len(faces_pred)):
- draw_rotate_rectange(img, faces_pred[i], resize_val, (0, 255, 0), 3)
-
-
-def draw_rotate_rectange(img, face, resize_val, color, thickness):
- cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
- face[3] * resize_val), int(face[2] * resize_val)), color, thickness)
-
- cv2.line(img, (int(face[3] * resize_val), int(face[2] * resize_val)), (int(
- face[3] * resize_val), int(face[4] * resize_val)), color, thickness)
-
- cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
- face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
-
- cv2.line(img, (int(face[3] * resize_val), int(face[4] * resize_val)), (int(
- face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
+def to_chw_bgr(image):
+ """
+    Transpose the image from HWC layout to CHW and from RGB channel order to BGR.
+ Args:
+        image (np.array): an image in HWC layout with RGB channel order.
+ """
+ # HWC to CHW
+ if len(image.shape) == 3:
+ image = np.swapaxes(image, 1, 2)
+ image = np.swapaxes(image, 1, 0)
+    # RGB to BGR
+ image = image[[2, 1, 0], :, :]
+ return image
def preprocess(img, bbox_labels, mode, settings, image_path):
@@ -107,9 +103,6 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
batch_sampler, bbox_labels, img_width, img_height, scale_array,
settings.resize_width, settings.resize_height)
img = np.array(img)
- # Debug
- # img_save = Image.fromarray(img)
- # img_save.save('img_orig.jpg')
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image_sampling(
@@ -118,17 +111,7 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
settings.min_face_size)
img = img.astype('uint8')
- # Debug: visualize the gt bbox
- visualize_bbox = 0
- if visualize_bbox:
- img_show = img
- draw_image(sampled_labels, img_show, settings.resize_height)
- img_show = Image.fromarray(img_show)
- img_show.save('final_img_show.jpg')
-
img = Image.fromarray(img)
- # Debug
- # img.save('final_img.jpg')
else:
# hard-code here
@@ -172,46 +155,41 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
tmp = sampled_labels[i][1]
sampled_labels[i][1] = 1 - sampled_labels[i][3]
sampled_labels[i][3] = 1 - tmp
- # HWC to CHW
- if len(img.shape) == 3:
- img = np.swapaxes(img, 1, 2)
- img = np.swapaxes(img, 1, 0)
- # RBG to BGR
- img = img[[2, 1, 0], :, :]
+
+ img = to_chw_bgr(img)
img = img.astype('float32')
img -= settings.img_mean
img = img * settings.scale
return img, sampled_labels
-def put_txt_in_dict(input_txt):
+def load_file_list(input_txt):
with open(input_txt, 'r') as f_dir:
lines_input_txt = f_dir.readlines()
- dict_input_txt = {}
+ file_dict = {}
num_class = 0
for i in range(len(lines_input_txt)):
- tmp_line_txt = lines_input_txt[i].strip('\n\t\r')
- if '--' in tmp_line_txt:
+ line_txt = lines_input_txt[i].strip('\n\t\r')
+ if '--' in line_txt:
if i != 0:
num_class += 1
- dict_input_txt[num_class] = []
- dict_name = tmp_line_txt
- dict_input_txt[num_class].append(tmp_line_txt)
- if '--' not in tmp_line_txt:
- if len(tmp_line_txt) > 6:
- split_str = tmp_line_txt.split(' ')
+ file_dict[num_class] = []
+ file_dict[num_class].append(line_txt)
+ if '--' not in line_txt:
+ if len(line_txt) > 6:
+ split_str = line_txt.split(' ')
x1_min = float(split_str[0])
y1_min = float(split_str[1])
x2_max = float(split_str[2])
y2_max = float(split_str[3])
- tmp_line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
+ line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
x2_max) + ' ' + str(y2_max)
- dict_input_txt[num_class].append(tmp_line_txt)
+ file_dict[num_class].append(line_txt)
else:
- dict_input_txt[num_class].append(tmp_line_txt)
+ file_dict[num_class].append(line_txt)
- return dict_input_txt
+ return file_dict
def expand_bboxes(bboxes,
@@ -238,68 +216,106 @@ def expand_bboxes(bboxes,
return expand_boxes
-def pyramidbox(settings, file_list, mode, shuffle):
-
- dict_input_txt = {}
- dict_input_txt = put_txt_in_dict(file_list)
+def train_generator(settings, file_list, batch_size, shuffle=True):
+ file_dict = load_file_list(file_list)
+ while True:
+        image_ids = list(file_dict.keys())
+        if shuffle:
+            # a dict cannot be shuffled in place, so shuffle its keys
+            random.shuffle(image_ids)
+ images, face_boxes, head_boxes, label_ids = [], [], [], []
+ label_offs = [0]
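+        # label_offs accumulates LoD offsets: entry i is the index of the
+        # first box belonging to image i in the flattened box lists.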
- def reader():
- if mode == 'train' and shuffle:
- random.shuffle(dict_input_txt)
- for index_image in range(len(dict_input_txt)):
-
- image_name = dict_input_txt[index_image][0] + '.jpg'
+        for index_image in image_ids:
+ image_name = file_dict[index_image][0]
image_path = os.path.join(settings.data_dir, image_name)
-
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
im_width, im_height = im.size
# layout: label | xmin | ymin | xmax | ymax
- if mode == 'train':
- bbox_labels = []
- for index_box in range(len(dict_input_txt[index_image])):
- if index_box >= 2:
- bbox_sample = []
- temp_info_box = dict_input_txt[index_image][
- index_box].split(' ')
- xmin = float(temp_info_box[0])
- ymin = float(temp_info_box[1])
- w = float(temp_info_box[2])
- h = float(temp_info_box[3])
- xmax = xmin + w
- ymax = ymin + h
-
- bbox_sample.append(1)
- bbox_sample.append(float(xmin) / im_width)
- bbox_sample.append(float(ymin) / im_height)
- bbox_sample.append(float(xmax) / im_width)
- bbox_sample.append(float(ymax) / im_height)
- bbox_labels.append(bbox_sample)
-
- im, sample_labels = preprocess(im, bbox_labels, mode, settings,
- image_path)
- sample_labels = np.array(sample_labels)
- if len(sample_labels) == 0: continue
- im = im.astype('float32')
- boxes = sample_labels[:, 1:5]
- lbls = [1] * len(boxes)
- difficults = [1] * len(boxes)
- yield im, boxes, expand_bboxes(boxes), lbls, difficults
-
- if mode == 'test':
- yield im, image_path
+ bbox_labels = []
+ for index_box in range(len(file_dict[index_image])):
+ if index_box >= 2:
+ bbox_sample = []
+ temp_info_box = file_dict[index_image][index_box].split(' ')
+ xmin = float(temp_info_box[0])
+ ymin = float(temp_info_box[1])
+ w = float(temp_info_box[2])
+ h = float(temp_info_box[3])
+ xmax = xmin + w
+ ymax = ymin + h
+
+ bbox_sample.append(1)
+ bbox_sample.append(float(xmin) / im_width)
+ bbox_sample.append(float(ymin) / im_height)
+ bbox_sample.append(float(xmax) / im_width)
+ bbox_sample.append(float(ymax) / im_height)
+ bbox_labels.append(bbox_sample)
+
+ im, sample_labels = preprocess(im, bbox_labels, "train", settings,
+ image_path)
+ sample_labels = np.array(sample_labels)
+ if len(sample_labels) == 0: continue
+
+ im = im.astype('float32')
+ face_box = sample_labels[:, 1:5]
+ head_box = expand_bboxes(face_box)
+ label = [1] * len(face_box)
+
+ images.append(im)
+ face_boxes.extend(face_box)
+ head_boxes.extend(head_box)
+ label_ids.extend(label)
+ label_offs.append(label_offs[-1] + len(face_box))
+
+ if len(images) == batch_size:
+ images = np.array(images).astype('float32')
+ face_boxes = np.array(face_boxes).astype('float32')
+ head_boxes = np.array(head_boxes).astype('float32')
+ label_ids = np.array(label_ids).astype('int32')
+ yield images, face_boxes, head_boxes, label_ids, label_offs
+ images, face_boxes, head_boxes = [], [], []
+ label_ids, label_offs = [], [0]
+
+
+def train_batch_reader(settings,
+ file_list,
+ batch_size,
+ shuffle=True,
+ num_workers=8):
+    enqueuer = None
+    try:
+ enqueuer = GeneratorEnqueuer(
+ train_generator(settings, file_list, batch_size, shuffle),
+ use_multiprocessing=False)
+ enqueuer.start(max_queue_size=24, workers=num_workers)
+ generator_output = None
+ while True:
+ while enqueuer.is_running():
+ if not enqueuer.queue.empty():
+ generator_output = enqueuer.queue.get()
+ break
+ else:
+ time.sleep(0.01)
+ yield generator_output
+ generator_output = None
+ finally:
+ if enqueuer is not None:
+ enqueuer.stop()
- return reader
+def test(settings, file_list):
+ file_dict = load_file_list(file_list)
-def train(settings, file_list, shuffle=True):
- return pyramidbox(settings, file_list, 'train', shuffle)
-
+ def reader():
+ for index_image in file_dict.keys():
+ image_name = file_dict[index_image][0]
+ image_path = os.path.join(settings.data_dir, image_name)
+ im = Image.open(image_path)
+ if im.mode == 'L':
+ im = im.convert('RGB')
+ yield im, image_path
-def test(settings, file_list):
- return pyramidbox(settings, file_list, 'test', False)
+ return reader
def infer(settings, image_path):
@@ -312,12 +328,7 @@ def infer(settings, image_path):
img = img.resize((settings.resize_width, settings.resize_height),
Image.ANTIALIAS)
img = np.array(img)
- # HWC to CHW
- if len(img.shape) == 3:
- img = np.swapaxes(img, 1, 2)
- img = np.swapaxes(img, 1, 0)
- # RBG to BGR
- img = img[[2, 1, 0], :, :]
+ img = to_chw_bgr(img)
img = img.astype('float32')
img -= settings.img_mean
img = img * settings.scale
diff --git a/fluid/face_detection/train.py b/fluid/face_detection/train.py
index acff16ecc354ac625699596e75b2db2c8f164a95..b62ac26d0d7236421e80ed4396c6ed3d0f72c310 100644
--- a/fluid/face_detection/train.py
+++ b/fluid/face_detection/train.py
@@ -5,27 +5,26 @@ import time
import argparse
import functools
-import reader
-import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
+import reader
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
-add_arg('parallel', bool, True, "parallel")
-add_arg('learning_rate', float, 0.001, "Learning rate.")
-add_arg('batch_size', int, 12, "Minibatch size.")
+add_arg('parallel', bool, True, "Whether use multi-GPU/threads or not.")
+add_arg('learning_rate', float, 0.001, "The start learning rate.")
+add_arg('batch_size', int, 16, "Minibatch size.")
add_arg('num_passes', int, 160, "Epoch number.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
-add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
-add_arg('resize_w', int, 640, "The resized image height.")
-add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
+add_arg('resize_w', int, 640, "The resized image width.")
+add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
#yapf: enable
@@ -58,8 +57,9 @@ def train(args, config, train_file_list, optimizer_method):
loss = network.vgg_ssd_loss()
fetches = [loss]
- epocs = 12880 / batch_size
- boundaries = [epocs * 50, epocs * 80, epocs * 120, epocs * 140]
+ steps_per_pass = 12880 / batch_size
+ boundaries = [steps_per_pass * 50, steps_per_pass * 80,
+ steps_per_pass * 120, steps_per_pass * 140]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
@@ -104,9 +104,7 @@ def train(args, config, train_file_list, optimizer_method):
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu, loss_name=loss.name)
- train_reader = paddle.batch(
- reader.train(config, train_file_list), batch_size=batch_size)
- feeder = fluid.DataFeeder(place=place, feed_list=network.feeds())
+ train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def save_model(postfix):
model_path = os.path.join(model_save_dir, postfix)
@@ -115,24 +113,38 @@ def train(args, config, train_file_list, optimizer_method):
print 'save models to %s' % (model_path)
fluid.io.save_persistables(exe, model_path)
+ def tensor(data, place, lod=None):
+ t = fluid.core.LoDTensor()
+ t.set(data, place)
+ if lod:
+ t.set_lod(lod)
+ return t
+
for pass_id in range(start_pass, num_passes):
start_time = time.time()
prev_start_time = start_time
end_time = 0
- for batch_id, data in enumerate(train_reader()):
+ for batch_id in range(steps_per_pass):
+ im, face_box, head_box, labels, lod = next(train_reader)
+ im_t = tensor(im, place)
+ box1 = tensor(face_box, place, [lod])
+ box2 = tensor(head_box, place, [lod])
+ lbl_t = tensor(labels, place, [lod])
+ feeding = {'image': im_t, 'face_box': box1,
+ 'head_box': box2, 'gt_label': lbl_t}
+
prev_start_time = start_time
start_time = time.time()
- if len(data) < 2 * devices_num: continue
if args.parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
- feed=feeder.feed(data))
+ feed=feeding)
else:
fetch_vars = exe.run(fluid.default_main_program(),
- feed=feeder.feed(data),
+ feed=feeding,
fetch_list=fetches)
end_time = time.time()
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
- if batch_id % 1 == 0:
+ if batch_id % 10 == 0:
if not args.use_pyramidbox:
print("Pass {0}, batch {1}, loss {2}, time {3}".format(
pass_id, batch_id, fetch_vars[0],
@@ -151,8 +163,8 @@ if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
- data_dir = 'data/WIDERFACE/WIDER_train/images/'
- train_file_list = 'label/train_gt_widerface.res'
+ data_dir = 'data/WIDER_train/images/'
+ train_file_list = 'data/wider_face_split/wider_face_train_bbx_gt.txt'
config = reader.Settings(
data_dir=data_dir,
diff --git a/fluid/face_detection/visualize.py b/fluid/face_detection/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..418ef533cf9f89dfe3526583f76f2228583e378a
--- /dev/null
+++ b/fluid/face_detection/visualize.py
@@ -0,0 +1,54 @@
+import os
+from PIL import Image
+from PIL import ImageDraw
+
+
+def draw_bbox(image, bbox):
+ """
+ Draw one bounding box on image.
+ Args:
+ image (PIL.Image): a PIL Image object.
+ bbox (np.array|list|tuple): (xmin, ymin, xmax, ymax).
+ """
+ draw = ImageDraw.Draw(image)
+    xmin, ymin, xmax, ymax = bbox
+ (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
+ draw.line(
+ [(left, top), (left, bottom), (right, bottom), (right, top),
+ (left, top)],
+ width=4,
+ fill='red')
+
+
+def draw_bboxes(image_file, bboxes, labels=None, output_dir=None):
+ """
+ Draw bounding boxes on image.
+
+ Args:
+ image_file (string): input image path.
+ bboxes (np.array): bounding boxes.
+ labels (list of string): the label names of bboxes.
+ output_dir (string): output directory.
+ """
+ if labels:
+ assert len(bboxes) == len(labels)
+
+ image = Image.open(image_file)
+ draw = ImageDraw.Draw(image)
+ for i in range(len(bboxes)):
+ xmin, ymin, xmax, ymax = bboxes[i]
+ (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
+ draw.line(
+ [(left, top), (left, bottom), (right, bottom), (right, top),
+ (left, top)],
+ width=4,
+ fill='red')
+ if labels and image.mode == 'RGB':
+ draw.text((left, top), labels[i], (255, 255, 0))
+
+ output_file = image_file.split('/')[-1]
+ if output_dir:
+ output_file = os.path.join(output_dir, output_file)
+
+ print("The image with bbox is saved as {}".format(output_file))
+ image.save(output_file)
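+
+
+# A minimal usage sketch (hypothetical path and boxes):
+#
+#     import numpy as np
+#     bboxes = np.array([[84., 92., 230., 303.], [10., 15., 60., 80.]])
+#     draw_bboxes('path/to/image.jpg', bboxes, output_dir='infer_results')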
diff --git a/fluid/face_detection/infer.py b/fluid/face_detection/widerface_eval.py
similarity index 53%
rename from fluid/face_detection/infer.py
rename to fluid/face_detection/widerface_eval.py
index a9468c33c110e04c82c9845414e1d83fee0bb7a7..72be5fa64d3ae96ca5f4933bca6036c05c2c6e5b 100644
--- a/fluid/face_detection/infer.py
+++ b/fluid/face_detection/widerface_eval.py
@@ -4,68 +4,130 @@ import numpy as np
import argparse
import functools
from PIL import Image
-from PIL import ImageDraw
-import paddle
import paddle.fluid as fluid
import reader
from pyramidbox import PyramidBox
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
+
# yapf: disable
-add_arg('use_gpu', bool, True, "Whether use GPU.")
-add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
-add_arg('confs_threshold', float, 0.25, "Confidence threshold to draw bbox.")
-add_arg('image_path', str, '', "The data root path.")
-add_arg('model_dir', str, '', "The model path.")
+add_arg('use_gpu', bool, True, "Whether use GPU or not.")
+add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
+add_arg('data_dir', str, 'data/WIDER_val/images/', "The validation dataset path.")
+add_arg('model_dir', str, '', "The model path.")
+add_arg('pred_dir', str, 'pred', "The path to save the evaluation results.")
+add_arg('file_list', str, 'data/wider_face_split/wider_face_val_bbx_gt.txt', "The annotation file of the validation dataset.")
# yapf: enable
-def draw_bounding_box_on_image(image_path, nms_out, confs_threshold):
- image = Image.open(image_path)
- draw = ImageDraw.Draw(image)
- for dt in nms_out:
- xmin, ymin, xmax, ymax, score = dt
- if score < confs_threshold:
- continue
- (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
- draw.line(
- [(left, top), (left, bottom), (right, bottom), (right, top),
- (left, top)],
- width=4,
- fill='red')
- image_name = image_path.split('/')[-1]
- image_class = image_path.split('/')[-2]
- print("image with bbox drawed saved as {}".format(image_name))
- image.save('./infer_results/' + image_class.encode('utf-8') + '/' +
- image_name.encode('utf-8'))
+def infer(args, config):
+ batch_size = 1
+ model_dir = args.model_dir
+ data_dir = args.data_dir
+ file_list = args.file_list
+ pred_dir = args.pred_dir
+
+ if not os.path.exists(model_dir):
+ raise ValueError("The model path [%s] does not exist." % (model_dir))
+
+ test_reader = reader.test(config, file_list)
+
+ for image, image_path in test_reader():
+ shrink, max_shrink = get_shrink(image.size[1], image.size[0])
+
+ det0 = detect_face(image, shrink)
+ det1 = flip_test(image, shrink)
+ [det2, det3] = multi_scale_test(image, max_shrink)
+ det4 = multi_scale_test_pyramid(image, max_shrink)
+ det = np.row_stack((det0, det1, det2, det3, det4))
+ dets = bbox_vote(det)
+
+ save_widerface_bboxes(image_path, dets, pred_dir)
+ print("Finish evaluation.")
-def write_to_txt(image_path, f, nms_out):
+
+def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
+ """
+ Save predicted results, including bbox and score into text file.
+ Args:
+ image_path (string): file name.
+        bboxes_scores (np.array|list): the predicted bboxes and scores;
+            the layout is (xmin, ymin, xmax, ymax, score).
+ output_dir (string): output directory.
+ """
image_name = image_path.split('/')[-1]
image_class = image_path.split('/')[-2]
- f.write('{:s}\n'.format(
- image_class.encode('utf-8') + '/' + image_name.encode('utf-8')))
- f.write('{:d}\n'.format(nms_out.shape[0]))
- for dt in nms_out:
- xmin, ymin, xmax, ymax, score = dt
+
+ image_name = image_name.encode('utf-8')
+ image_class = image_class.encode('utf-8')
+
+ odir = os.path.join(output_dir, image_class)
+ if not os.path.exists(odir):
+ os.makedirs(odir)
+
+ ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
+ f = open(ofname, 'w')
+ f.write('{:s}\n'.format(image_class + '/' + image_name))
+ f.write('{:d}\n'.format(bboxes_scores.shape[0]))
+ for box_score in bboxes_scores:
+ xmin, ymin, xmax, ymax, score = box_score
f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
xmax - xmin + 1), (ymax - ymin + 1), score))
- print("image infer result saved {}".format(image_name[:-4]))
+ f.close()
+ print("The predicted result is saved as {}".format(ofname))
+
+
+def detect_face(image, shrink):
+ image_shape = [3, image.size[1], image.size[0]]
+ if shrink != 1:
+ h, w = int(image_shape[1] * shrink), int(image_shape[2] * shrink)
+ image = image.resize((w, h), Image.ANTIALIAS)
+ image_shape = [3, h, w]
+ img = np.array(image)
+ img = reader.to_chw_bgr(img)
+ mean = [104., 117., 123.]
+ scale = 0.007843
+ img = img.astype('float32')
+ img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
+ img = img * scale
+ img = [img]
+ img = np.array(img)
+
+ place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ main_program = fluid.Program()
+ startup_program = fluid.Program()
+
+ with fluid.unique_name.guard():
+ with fluid.program_guard(main_program, startup_program):
+ network = PyramidBox(
+ image_shape, sub_network=args.use_pyramidbox, is_infer=True)
+ infer_program, nmsed_out = network.infer(main_program)
+ fetches = [nmsed_out]
+ fluid.io.load_persistables(
+ exe, args.model_dir, main_program=main_program)
+
+ detection, = exe.run(infer_program,
+ feed={'image': img},
+ fetch_list=fetches,
+ return_numpy=False)
+ detection = np.array(detection)
+    # layout: xmin, ymin, xmax, ymax, score
+ if detection.shape == (1, ):
+ print("No face detected")
+ return np.array([[0, 0, 0, 0, 0]])
+ det_conf = detection[:, 1]
+ det_xmin = image_shape[2] * detection[:, 2] / shrink
+ det_ymin = image_shape[1] * detection[:, 3] / shrink
+ det_xmax = image_shape[2] * detection[:, 4] / shrink
+ det_ymax = image_shape[1] * detection[:, 5] / shrink
-def get_round(x, loc):
- str_x = str(x)
- if '.' in str_x:
- len_after = len(str_x.split('.')[1])
- str_before = str_x.split('.')[0]
- str_after = str_x.split('.')[1]
- if len_after >= 3:
- str_final = str_before + '.' + str_after[0:loc]
- return float(str_final)
- else:
- return x
+ det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
+ return det
def bbox_vote(det):
@@ -86,7 +148,7 @@ def bbox_vote(det):
inter = w * h
o = inter / (area[0] + area[:] - inter)
- # get needed merge det and delete these det
+    # NMS: merge the boxes whose IoU with the current top-scored box is >= 0.3
merge_index = np.where(o >= 0.3)[0]
det_accu = det[merge_index, :]
det = np.delete(det, merge_index, 0)
@@ -111,78 +173,6 @@ def bbox_vote(det):
return dets
-def image_preprocess(image):
- img = np.array(image)
- # HWC to CHW
- if len(img.shape) == 3:
- img = np.swapaxes(img, 1, 2)
- img = np.swapaxes(img, 1, 0)
- # RBG to BGR
- img = img[[2, 1, 0], :, :]
- img = img.astype('float32')
- img -= np.array(
- [104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
- img = img * 0.007843
- img = [img]
- img = np.array(img)
- return img
-
-
-def detect_face(image, shrink):
- image_shape = [3, image.size[1], image.size[0]]
- num_classes = 2
- place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
- exe = fluid.Executor(place)
-
- if shrink != 1:
- image = image.resize((int(image_shape[2] * shrink),
- int(image_shape[1] * shrink)), Image.ANTIALIAS)
- image_shape = [
- image_shape[0], int(image_shape[1] * shrink),
- int(image_shape[2] * shrink)
- ]
- print "image_shape:", image_shape
- img = image_preprocess(image)
-
- scope = fluid.core.Scope()
- main_program = fluid.Program()
- startup_program = fluid.Program()
-
- with fluid.scope_guard(scope):
- with fluid.unique_name.guard():
- with fluid.program_guard(main_program, startup_program):
- fetches = []
- network = PyramidBox(
- image_shape,
- num_classes,
- sub_network=args.use_pyramidbox,
- is_infer=True)
- infer_program, nmsed_out = network.infer(main_program)
- fetches = [nmsed_out]
- fluid.io.load_persistables(
- exe, args.model_dir, main_program=main_program)
-
- detection, = exe.run(infer_program,
- feed={'image': img},
- fetch_list=fetches,
- return_numpy=False)
- detection = np.array(detection)
- # layout: xmin, ymin, xmax. ymax, score
- if detection.shape == (1, ):
- print("No face detected")
- return np.array([[0, 0, 0, 0, 0]])
- det_conf = detection[:, 1]
- det_xmin = image_shape[2] * detection[:, 2] / shrink
- det_ymin = image_shape[1] * detection[:, 3] / shrink
- det_xmax = image_shape[2] * detection[:, 4] / shrink
- det_ymax = image_shape[1] * detection[:, 5] / shrink
-
- det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
- keep_index = np.where(det[:, 4] >= 0)[0]
- det = det[keep_index, :]
- return det
-
-
def flip_test(image, shrink):
img = image.transpose(Image.FLIP_LEFT_RIGHT)
det_f = detect_face(img, shrink)
@@ -197,18 +187,18 @@ def flip_test(image, shrink):
def multi_scale_test(image, max_shrink):
- # shrink detecting and shrink only detect big face
+ # Shrink detecting is only used to detect big faces
st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
det_s = detect_face(image, st)
index = np.where(
np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)
> 30)[0]
det_s = det_s[index, :]
- # enlarge one times
+    # Enlarge the image once
bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
det_b = detect_face(image, bt)
- # enlarge small image x times for small face
+    # Enlarge the small image several times to detect small faces
if max_shrink > 2:
bt *= 2
while bt < max_shrink:
@@ -216,12 +206,13 @@ def multi_scale_test(image, max_shrink):
bt *= 2
det_b = np.row_stack((det_b, detect_face(image, max_shrink)))
- # enlarge only detect small face
+ # Enlarged images are only used to detect small faces.
if bt > 1:
index = np.where(
np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
det_b = det_b[index, :]
+    # Shrunken images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
@@ -231,23 +222,24 @@ def multi_scale_test(image, max_shrink):
def multi_scale_test_pyramid(image, max_shrink):
- # shrink detecting and shrink only detect big face
+ # Use image pyramids to detect faces
det_b = detect_face(image, 0.25)
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
> 30)[0]
det_b = det_b[index, :]
- st = [0.5, 0.75, 1.25, 1.5, 1.75, 2.25]
+ st = [0.75, 1.25, 1.5, 1.75]
for i in range(len(st)):
if (st[i] <= max_shrink):
det_temp = detect_face(image, st[i])
- # enlarge only detect small face
+ # Enlarged images are only used to detect small faces.
if st[i] > 1:
index = np.where(
np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
det_temp = det_temp[index, :]
+            # Shrunken images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
@@ -257,13 +249,28 @@ def multi_scale_test_pyramid(image, max_shrink):
return det_b
-def get_im_shrink(image_shape):
- max_shrink_v1 = (0x7fffffff / 577.0 /
- (image_shape[1] * image_shape[2]))**0.5
- max_shrink_v2 = (
- (678 * 1024 * 2.0 * 2.0) / (image_shape[1] * image_shape[2]))**0.5
- max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
+def get_shrink(height, width):
+ """
+ Args:
+ height (int): image height.
+ width (int): image width.
+ """
+ # avoid out of memory
+ max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
+ max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
+
+ def get_round(x, loc):
+ str_x = str(x)
+ if '.' in str_x:
+ str_before, str_after = str_x.split('.')
+ len_after = len(str_after)
+ if len_after >= 3:
+ str_final = str_before + '.' + str_after[0:loc]
+ return float(str_final)
+            else:
+                return x
+        # x has no decimal point; return it unchanged
+        return x
+ max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
if max_shrink >= 1.5 and max_shrink < 2:
max_shrink = max_shrink - 0.1
elif max_shrink >= 2 and max_shrink < 3:
@@ -275,60 +282,12 @@ def get_im_shrink(image_shape):
elif max_shrink >= 5:
max_shrink = max_shrink - 0.5
- print 'max_shrink = ', max_shrink
shrink = max_shrink if max_shrink < 1 else 1
- print "shrink = ", shrink
-
return shrink, max_shrink
-def infer(args, batch_size, data_args):
- if not os.path.exists(args.model_dir):
- raise ValueError("The model path [%s] does not exist." %
- (args.model_dir))
-
- infer_reader = paddle.batch(
- reader.test(data_args, file_list), batch_size=batch_size)
-
- for batch_id, img in enumerate(infer_reader()):
- image = img[0][0]
- image_path = img[0][1]
-
- # image.size: [width, height]
- image_shape = [3, image.size[1], image.size[0]]
-
- shrink, max_shrink = get_im_shrink(image_shape)
-
- det0 = detect_face(image, shrink)
- det1 = flip_test(image, shrink)
- [det2, det3] = multi_scale_test(image, max_shrink)
- det4 = multi_scale_test_pyramid(image, max_shrink)
- det = np.row_stack((det0, det1, det2, det3, det4))
- dets = bbox_vote(det)
-
- image_name = image_path.split('/')[-1]
- image_class = image_path.split('/')[-2]
- if not os.path.exists('./infer_results/' + image_class.encode('utf-8')):
- os.makedirs('./infer_results/' + image_class.encode('utf-8'))
-
- f = open('./infer_results/' + image_class.encode('utf-8') + '/' +
- image_name.encode('utf-8')[:-4] + '.txt', 'w')
- write_to_txt(image_path, f, dets)
- # draw_bounding_box_on_image(image_path, dets, args.confs_threshold)
- print "Done"
-
-
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
-
- data_dir = 'data/WIDERFACE/WIDER_val/images/'
- file_list = 'label/val_gt_widerface.res'
-
- data_args = reader.Settings(
- data_dir=data_dir,
- mean_value=[104., 117., 123],
- apply_distort=False,
- apply_expand=False,
- ap_version='11point')
- infer(args, batch_size=1, data_args=data_args)
+ config = reader.Settings(data_dir=args.data_dir)
+ infer(args, config)
diff --git a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh
index 947b8900bd944759437a55c20fb32bca4a1b9380..3e6e0ce6d6df0b8c5a5e7814e510eb64006ce34d 100644
--- a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh
+++ b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh
@@ -34,7 +34,7 @@ tar xf ${valid_tar} -C ${valid_folder}
echo "Download imagenet label file: val_list.txt & train_list.txt"
label_file=ImageNet_label.tgz
-label_url=http://imagenet-data.bj.bcebos.com/${label_file}
+label_url=http://paddle-imagenet-models.bj.bcebos.com/${label_file}
wget -nd -c ${label_url}
tar zxf ${label_file}
diff --git a/fluid/neural_machine_translation/transformer/README_cn.md b/fluid/neural_machine_translation/transformer/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..547b525b40abbfc3009e3948273db52ff394e535
--- /dev/null
+++ b/fluid/neural_machine_translation/transformer/README_cn.md
@@ -0,0 +1,163 @@
+Running the examples in this directory requires the latest develop branch of PaddlePaddle. If your installed version of PaddlePaddle is older than this, please update it following the instructions in the [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html).
+
+---
+
+## Transformer
+
+The following is a brief overview of the directory structure of this example:
+
+```text
+.
+├── images    # images used in this document
+├── optim.py  # learning rate scheduling
+├── infer.py  # inference script
+├── model.py  # model definition
+├── reader.py # data reading interface
+├── README.md # documentation
+├── train.py  # training script
+└── config.py # configuration for training, inference and model hyper-parameters
+```
+
+### Introduction
+
+Transformer is the novel network architecture proposed in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) for sequence-to-sequence (Seq2Seq) learning tasks such as machine translation (MT). It relies entirely on the attention mechanism for sequence-to-sequence modeling [1].
+
+Compared with the Recurrent Neural Networks (RNNs) widely used in earlier Seq2Seq models, using (self-)attention to transform an input sequence into an output sequence has the following main advantages:
+
+- Lower computational complexity
+  - For a sequence of length n with feature dimension d, the per-layer cost is `O(n * d * d)` for an RNN (n time steps, each computing a d-dimensional matrix-vector product) versus `O(n * n * d)` for self-attention (d-dimensional dot products, or other similarity functions, between every pair of the n time steps), and n is usually smaller than d (see the worked example after this list).
+- Higher parallelism
+  - In an RNN, each time step depends on the result of the previous one; in self-attention, every time step depends only on the input rather than on the outputs of earlier time steps, so all time steps can be computed fully in parallel.
+- Easier learning of long-range dependencies
+  - In an RNN, it takes n steps to establish a connection between two positions that are n apart; in self-attention, any two positions are directly connected, and the shorter the path, the more easily the signal propagates.
+
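+As a quick check on the complexity claim (our own arithmetic, not from the paper): with d = 512 and n = 60, one RNN layer costs on the order of n * d * d ≈ 15.7M multiply-adds per sequence, while one self-attention layer costs n * n * d ≈ 1.8M; self-attention only becomes the more expensive option once n exceeds d.
+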
+These advantages have been confirmed on machine translation: the Transformer set a new state-of-the-art BLEU score on the WMT'14 English-German task while greatly reducing training time. The Transformer also performs well when applied to constituency parsing, which indicates good generality and easy transfer to other application scenarios. All of this suggests a promising future for the Transformer.
+
+### Model Overview
+
+Like other Seq2Seq models, the Transformer adopts the typical Encoder-Decoder framework; the overall network structure is shown in Figure 1.
+
+
+*Figure 1. Network architecture of the Transformer*
+
+*Figure 2. Multi-Head Attention*
+
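+The building block of the Multi-Head Attention in Figure 2 is scaled dot-product attention. As given in the paper,
+
+```text
+Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_k)) * V
+```
+
+where Q, K and V are the query, key and value matrices and d_k is the dimension of the keys; scaling by sqrt(d_k) keeps the dot products from growing too large before the softmax.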