Commit f4d6e3e7 authored by Yibing Liu

Merge learning rate decay

......@@ -207,7 +207,7 @@ class AsyncDataReader(object):
feature_file_list,
label_file_list="",
drop_frame_len=512,
split_sentence_threshold=512,
split_sentence_threshold=1024,
proc_num=10,
sample_buffer_size=1024,
sample_info_buffer_size=1024,
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../../tools/profile.py --feature_lst data/train_feature.lst \
--label_lst data/train_label.lst \
--mean_var data/aishell/global_mean_var \
--parallel \
--mean_var data/global_mean_var \
--frame_dim 80 \
--class_num 3040 \
--batch_size 16
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=4,5,6,7
python -u ../../train.py --train_feature_lst data/train_feature.lst \
--train_label_lst data/train_label.lst \
--val_feature_lst data/val_feature.lst \
--val_label_lst data/val_label.lst \
--mean_var data/aishell/global_mean_var \
--mean_var data/global_mean_var \
--checkpoints checkpoints \
--frame_dim 80 \
--class_num 3040 \
......@@ -11,4 +11,3 @@ python -u ../../train.py --train_feature_lst data/train_feature.lst \
--batch_size 64 \
--learning_rate 6.4e-5 \
--parallel
......@@ -187,7 +187,12 @@ def infer_from_ckpt(args):
infer_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
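# Note (illustrative): with staircase=True this schedule is a step function,
#   lr(step) = learning_rate * (1 / 1.2) ** (step // 1879)
# i.e. the learning rate is multiplied by 1/1.2 once every 1879 steps.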
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......
......@@ -137,7 +137,12 @@ def profile(args):
class_num=args.class_num,
parallel=args.parallel)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......@@ -150,7 +155,8 @@ def profile(args):
trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5)
]
data_reader = reader.AsyncDataReader(args.feature_lst, args.label_lst, -1)
data_reader = reader.AsyncDataReader(
args.feature_lst, args.label_lst, -1, split_sentence_threshold=1024)
data_reader.set_transformers(ltrans)
feature_t = fluid.LoDTensor()
......
......@@ -159,7 +159,12 @@ def train(args):
test_program = fluid.default_main_program().clone()
#optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......@@ -186,8 +191,11 @@ def train(args):
os.path.exists(args.val_label_lst)):
return -1.0, -1.0
# test data reader
test_data_reader = reader.AsyncDataReader(args.val_feature_lst,
args.val_label_lst, -1)
test_data_reader = reader.AsyncDataReader(
args.val_feature_lst,
args.val_label_lst,
-1,
split_sentence_threshold=1024)
test_data_reader.set_transformers(ltrans)
test_costs, test_accs = [], []
for batch_id, batch_data in enumerate(
......@@ -212,8 +220,11 @@ def train(args):
return np.mean(test_costs), np.mean(test_accs)
# train data reader
train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
args.train_label_lst, -1)
train_data_reader = reader.AsyncDataReader(
args.train_feature_lst,
args.train_label_lst,
-1,
split_sentence_threshold=1024)
train_data_reader.set_transformers(ltrans)
# train
......
......@@ -4,4 +4,6 @@ data/
label/
*.swp
*.log
infer_results/
log*
output*
infer_results*
"""
This code is based on https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py
"""
import time
import numpy as np
import threading
import multiprocessing
try:
import queue
except ImportError:
import Queue as queue
class GeneratorEnqueuer(object):
"""
Builds a queue out of a data generator.
Args:
generator: a generator function which endlessly yields data
use_multiprocessing (bool): use multiprocessing if True,
otherwise use threading.
wait_time (float): time to sleep in-between calls to `put()`.
random_seed (int): Initial seed for workers,
will be incremented by one for each worker.
"""
def __init__(self,
generator,
use_multiprocessing=False,
wait_time=0.05,
random_seed=None):
self.wait_time = wait_time
self._generator = generator
self._use_multiprocessing = use_multiprocessing
self._threads = []
self._stop_event = None
self.queue = None
self._manager = None
self.seed = random_seed
def start(self, workers=1, max_queue_size=10):
"""
Start worker threads which add data from the generator into the queue.
Args:
workers (int): number of worker threads
max_queue_size (int): queue size
(when full, threads could block on `put()`)
"""
def data_generator_task():
"""
Data generator task.
"""
def task():
if (self.queue is not None and
self.queue.qsize() < max_queue_size):
generator_output = next(self._generator)
self.queue.put((generator_output))
else:
time.sleep(self.wait_time)
if not self._use_multiprocessing:
while not self._stop_event.is_set():
with self.genlock:
try:
task()
except Exception:
self._stop_event.set()
break
else:
while not self._stop_event.is_set():
try:
task()
except Exception:
self._stop_event.set()
break
try:
if self._use_multiprocessing:
self._manager = multiprocessing.Manager()
self.queue = self._manager.Queue(maxsize=max_queue_size)
self._stop_event = multiprocessing.Event()
else:
self.genlock = threading.Lock()
self.queue = queue.Queue()
self._stop_event = threading.Event()
for _ in range(workers):
if self._use_multiprocessing:
# Reset random seed else all children processes
# share the same seed
np.random.seed(self.seed)
thread = multiprocessing.Process(target=data_generator_task)
thread.daemon = True
if self.seed is not None:
self.seed += 1
else:
thread = threading.Thread(target=data_generator_task)
self._threads.append(thread)
thread.start()
except:
self.stop()
raise
def is_running(self):
"""
Returns:
bool: Whether the worker threads are running.
"""
return self._stop_event is not None and not self._stop_event.is_set()
def stop(self, timeout=None):
"""
Stops the running threads and waits for them to exit, if necessary.
Should be called by the same thread which called `start()`.
Args:
timeout(int|None): maximum time to wait on `thread.join()`.
"""
if self.is_running():
self._stop_event.set()
for thread in self._threads:
if self._use_multiprocessing:
if thread.is_alive():
thread.terminate()
else:
thread.join(timeout)
if self._manager:
self._manager.shutdown()
self._threads = []
self._stop_event = None
self.queue = None
def get(self):
"""
Creates a generator to extract data from the queue.
Skip the data if it is `None`.
# Yields
tuple of data in the queue.
"""
while self.is_running():
if not self.queue.empty():
inputs = self.queue.get()
if inputs is not None:
yield inputs
else:
time.sleep(self.wait_time)
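# Illustrative usage sketch (toy generator; not part of the original file):
#
#   def toy_batches():
#       while True:
#           yield np.zeros((4, 3))
#
#   enqueuer = GeneratorEnqueuer(toy_batches(), use_multiprocessing=False)
#   enqueuer.start(workers=2, max_queue_size=10)
#   stream = enqueuer.get()
#   first_batch = next(stream)
#   enqueuer.stop()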
......@@ -3,6 +3,7 @@ from PIL import ImageFile
import numpy as np
import random
import math
import cv2
ImageFile.LOAD_TRUNCATED_IMAGES = True  # otherwise an IOError is raised when the image file is truncated
......@@ -100,6 +101,76 @@ def generate_sample(sampler, image_width, image_height):
return sampled_bbox
def data_anchor_sampling(sampler, bbox_labels, image_width, image_height,
scale_array, resize_width, resize_height):
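# PyramidBox-style data-anchor-sampling: pick a random ground-truth face,
# locate the anchor-scale interval its area falls into, draw a nearby
# anchor scale at random, and crop a square region sized so that the face
# is effectively resized towards the chosen scale.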
num_gt = len(bbox_labels)
# np.random.randint range: [low, high)
rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
if num_gt != 0:
norm_xmin = bbox_labels[rand_idx][1]
norm_ymin = bbox_labels[rand_idx][2]
norm_xmax = bbox_labels[rand_idx][3]
norm_ymax = bbox_labels[rand_idx][4]
xmin = norm_xmin * image_width
ymin = norm_ymin * image_height
wid = image_width * (norm_xmax - norm_xmin)
hei = image_height * (norm_ymax - norm_ymin)
range_size = 0
for scale_ind in range(0, len(scale_array) - 1):
area = wid * hei
if area > scale_array[scale_ind] ** 2 and area < \
scale_array[scale_ind + 1] ** 2:
range_size = scale_ind + 1
break
scale_choose = 0.0
if range_size == 0:
rand_idx_size = range_size + 1
else:
# np.random.randint range: [low, high)
rng_rand_size = np.random.randint(0, range_size)
rand_idx_size = rng_rand_size % range_size
scale_choose = random.uniform(scale_array[rand_idx_size] / 2.0,
2.0 * scale_array[rand_idx_size])
sample_bbox_size = wid * resize_width / scale_choose
w_off_orig = 0.0
h_off_orig = 0.0
if sample_bbox_size < max(image_height, image_width):
if wid <= sample_bbox_size:
w_off_orig = random.uniform(xmin + wid - sample_bbox_size, xmin)
else:
w_off_orig = random.uniform(xmin, xmin + wid - sample_bbox_size)
if hei <= sample_bbox_size:
h_off_orig = random.uniform(ymin + hei - sample_bbox_size, ymin)
else:
h_off_orig = random.uniform(ymin, ymin + hei - sample_bbox_size)
else:
w_off_orig = random.uniform(image_width - sample_bbox_size, 0.0)
h_off_orig = random.uniform(image_height - sample_bbox_size, 0.0)
w_off_orig = math.floor(w_off_orig)
h_off_orig = math.floor(h_off_orig)
# Figure out top left coordinates.
w_off = 0.0
h_off = 0.0
w_off = float(w_off_orig / image_width)
h_off = float(h_off_orig / image_height)
sampled_bbox = bbox(w_off, h_off,
w_off + float(sample_bbox_size / image_width),
h_off + float(sample_bbox_size / image_height))
return sampled_bbox
def jaccard_overlap(sample_bbox, object_bbox):
if sample_bbox.xmin >= object_bbox.xmax or \
sample_bbox.xmax <= object_bbox.xmin or \
......@@ -161,8 +232,6 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
def generate_batch_samples(batch_sampler, bbox_labels, image_width,
image_height):
sampled_bbox = []
index = []
c = 0
for sampler in batch_sampler:
found = 0
for i in range(sampler.max_trial):
......@@ -172,8 +241,24 @@ def generate_batch_samples(batch_sampler, bbox_labels, image_width,
if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
sampled_bbox.append(sample_bbox)
found = found + 1
index.append(c)
c = c + 1
return sampled_bbox
def generate_batch_random_samples(batch_sampler, bbox_labels, image_width,
image_height, scale_array, resize_width,
resize_height):
sampled_bbox = []
for sampler in batch_sampler:
found = 0
for i in range(sampler.max_trial):
if found >= sampler.max_sample:
break
sample_bbox = data_anchor_sampling(
sampler, bbox_labels, image_width, image_height, scale_array,
resize_width, resize_height)
if satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
sampled_bbox.append(sample_bbox)
found = found + 1
return sampled_bbox
......@@ -237,48 +322,117 @@ def transform_labels(bbox_labels, sample_bbox):
return sample_labels
def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
def transform_labels_sampling(bbox_labels, sample_bbox, resize_val,
min_face_size):
sample_labels = []
for i in range(len(bbox_labels)):
sample_label = []
object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2],
bbox_labels[i][3], bbox_labels[i][4])
if not meet_emit_constraint(object_bbox, sample_bbox):
continue
proj_bbox = project_bbox(object_bbox, sample_bbox)
if proj_bbox:
real_width = float((proj_bbox.xmax - proj_bbox.xmin) * resize_val)
real_height = float((proj_bbox.ymax - proj_bbox.ymin) * resize_val)
if real_width * real_height < float(min_face_size * min_face_size):
continue
else:
sample_label.append(bbox_labels[i][0])
sample_label.append(float(proj_bbox.xmin))
sample_label.append(float(proj_bbox.ymin))
sample_label.append(float(proj_bbox.xmax))
sample_label.append(float(proj_bbox.ymax))
sample_label = sample_label + bbox_labels[i][5:]
sample_labels.append(sample_label)
return sample_labels
def crop_image(img, bbox_labels, sample_bbox, image_width, image_height,
resize_width, resize_height, min_face_size):
sample_bbox = clip_bbox(sample_bbox)
xmin = int(sample_bbox.xmin * image_width)
xmax = int(sample_bbox.xmax * image_width)
ymin = int(sample_bbox.ymin * image_height)
ymax = int(sample_bbox.ymax * image_height)
sample_img = img[ymin:ymax, xmin:xmax]
sample_labels = transform_labels(bbox_labels, sample_bbox)
resize_val = resize_width
sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
resize_val, min_face_size)
return sample_img, sample_labels
def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
image_height, resize_width, resize_height,
min_face_size):
# no clipping here
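# The sample bbox may extend beyond the image borders, so instead of
# clipping we paste the intersecting region into a zero-filled canvas of
# the bbox size, then resize the canvas to (resize_width, resize_height).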
xmin = int(sample_bbox.xmin * image_width)
xmax = int(sample_bbox.xmax * image_width)
ymin = int(sample_bbox.ymin * image_height)
ymax = int(sample_bbox.ymax * image_height)
w_off = xmin
h_off = ymin
width = xmax - xmin
height = ymax - ymin
cross_xmin = max(0.0, float(w_off))
cross_ymin = max(0.0, float(h_off))
cross_xmax = min(float(w_off + width - 1.0), float(image_width))
cross_ymax = min(float(h_off + height - 1.0), float(image_height))
cross_width = cross_xmax - cross_xmin
cross_height = cross_ymax - cross_ymin
roi_xmin = 0 if w_off >= 0 else abs(w_off)
roi_ymin = 0 if h_off >= 0 else abs(h_off)
roi_width = cross_width
roi_height = cross_height
sample_img = np.zeros((height, width, 3))
sample_img[int(roi_ymin) : int(roi_ymin + roi_height), int(roi_xmin) : int(roi_xmin + roi_width)] = \
img[int(cross_ymin) : int(cross_ymin + cross_height), int(cross_xmin) : int(cross_xmin + cross_width)]
sample_img = cv2.resize(
sample_img, (resize_width, resize_height), interpolation=cv2.INTER_AREA)
resize_val = resize_width
sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
resize_val, min_face_size)
return sample_img, sample_labels
def random_brightness(img, settings):
prob = random.uniform(0, 1)
if prob < settings._brightness_prob:
delta = random.uniform(-settings._brightness_delta,
settings._brightness_delta) + 1
if prob < settings.brightness_prob:
delta = random.uniform(-settings.brightness_delta,
settings.brightness_delta) + 1
img = ImageEnhance.Brightness(img).enhance(delta)
return img
def random_contrast(img, settings):
prob = random.uniform(0, 1)
if prob < settings._contrast_prob:
delta = random.uniform(-settings._contrast_delta,
settings._contrast_delta) + 1
if prob < settings.contrast_prob:
delta = random.uniform(-settings.contrast_delta,
settings.contrast_delta) + 1
img = ImageEnhance.Contrast(img).enhance(delta)
return img
def random_saturation(img, settings):
prob = random.uniform(0, 1)
if prob < settings._saturation_prob:
delta = random.uniform(-settings._saturation_delta,
settings._saturation_delta) + 1
if prob < settings.saturation_prob:
delta = random.uniform(-settings.saturation_delta,
settings.saturation_delta) + 1
img = ImageEnhance.Color(img).enhance(delta)
return img
def random_hue(img, settings):
prob = random.uniform(0, 1)
if prob < settings._hue_prob:
delta = random.uniform(-settings._hue_delta, settings._hue_delta)
if prob < settings.hue_prob:
delta = random.uniform(-settings.hue_delta, settings.hue_delta)
img_hsv = np.array(img.convert('HSV'))
img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
......@@ -303,9 +457,9 @@ def distort_image(img, settings):
def expand_image(img, bbox_labels, img_width, img_height, settings):
prob = random.uniform(0, 1)
if prob < settings._expand_prob:
if settings._expand_max_ratio - 1 >= 0.01:
expand_ratio = random.uniform(1, settings._expand_max_ratio)
if prob < settings.expand_prob:
if settings.expand_max_ratio - 1 >= 0.01:
expand_ratio = random.uniform(1, settings.expand_max_ratio)
height = int(img_height * expand_ratio)
width = int(img_width * expand_ratio)
h_off = math.floor(random.uniform(0, height - img_height))
......@@ -314,7 +468,7 @@ def expand_image(img, bbox_labels, img_width, img_height, settings):
(width - w_off) / img_width,
(height - h_off) / img_height)
expand_img = np.ones((height, width, 3))
expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean))
expand_img = np.uint8(expand_img * np.squeeze(settings.img_mean))
expand_img = Image.fromarray(expand_img)
expand_img.paste(img, (int(w_off), int(h_off)))
bbox_labels = transform_labels(bbox_labels, expand_bbox)
......
......@@ -15,7 +15,7 @@ parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, False, "Whether use PyramidBox model.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('confs_threshold', float, 0.25, "Confidence threshold to draw bbox.")
add_arg('image_path', str, '', "The data root path.")
add_arg('model_dir', str, '', "The model path.")
......@@ -168,6 +168,9 @@ def detect_face(image, shrink):
return_numpy=False)
detection = np.array(detection)
# layout: xmin, ymin, xmax, ymax, score
if detection.shape == (1, ):
print("No face detected")
return np.array([[0, 0, 0, 0, 0]])
det_conf = detection[:, 1]
det_xmin = image_shape[2] * detection[:, 2] / shrink
det_ymin = image_shape[1] * detection[:, 3] / shrink
......@@ -227,6 +230,33 @@ def multi_scale_test(image, max_shrink):
return det_s, det_b
def multi_scale_test_pyramid(image, max_shrink):
# shrink detection: at the smallest scale, keep only big faces
det_b = detect_face(image, 0.25)
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
> 30)[0]
det_b = det_b[index, :]
st = [0.5, 0.75, 1.25, 1.5, 1.75, 2.25]
for i in range(len(st)):
if (st[i] <= max_shrink):
det_temp = detect_face(image, st[i])
# enlarged scales keep only small faces
if st[i] > 1:
index = np.where(
np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
det_temp = det_temp[index, :]
else:
index = np.where(
np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
det_temp = det_temp[index, :]
det_b = np.row_stack((det_b, det_temp))
return det_b
def get_im_shrink(image_shape):
max_shrink_v1 = (0x7fffffff / 577.0 /
(image_shape[1] * image_shape[2]))**0.5
......@@ -272,7 +302,8 @@ def infer(args, batch_size, data_args):
det0 = detect_face(image, shrink)
det1 = flip_test(image, shrink)
[det2, det3] = multi_scale_test(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3))
det4 = multi_scale_test_pyramid(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3, det4))
dets = bbox_vote(det)
image_name = image_path.split('/')[-1]
......
import os
import shutil
import numpy as np
import time
import argparse
import functools
import reader
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from pyramidbox import PyramidBox
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 20, "Minibatch size.")
add_arg('num_iteration', int, 10, "Number of iterations to profile.")
add_arg('skip_reader', bool, False, "Whether to skip data reader.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
#yapf: enable
def train(args, config, train_file_list, optimizer_method):
learning_rate = args.learning_rate
batch_size = args.batch_size
height = args.resize_h
width = args.resize_w
use_gpu = args.use_gpu
use_pyramidbox = args.use_pyramidbox
model_save_dir = args.model_save_dir
pretrained_model = args.pretrained_model
skip_reader = args.skip_reader
num_iterations = args.num_iteration
parallel = args.parallel
num_classes = 2
image_shape = [3, height, width]
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
fetches = []
network = PyramidBox(image_shape, num_classes,
sub_network=use_pyramidbox)
if use_pyramidbox:
face_loss, head_loss, loss = network.train()
fetches = [face_loss, head_loss]
else:
loss = network.vgg_ssd_loss()
fetches = [loss]
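# 12880 is the number of images in the WIDER FACE training set, so
# `epocs` is the number of iterations per pass; the piecewise schedule
# below decays the learning rate at passes 40, 60, 80 and 100.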
epocs = 12880 / batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
]
if optimizer_method == "momentum":
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=boundaries, values=values),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.0005),
)
else:
optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.0005),
)
optimizer.minimize(loss)
fluid.memory_optimize(fluid.default_main_program())
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
start_pass = 0
if pretrained_model:
if pretrained_model.isdigit():
start_pass = int(pretrained_model) + 1
pretrained_model = os.path.join(model_save_dir, pretrained_model)
print("Resume from %s " %(pretrained_model))
if not os.path.exists(pretrained_model):
raise ValueError("The pre-trained model path [%s] does not exist." %
(pretrained_model))
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu, loss_name=loss.name)
train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def tensor(data, place, lod=None):
t = fluid.core.LoDTensor()
t.set(data, place)
if lod:
t.set_lod(lod)
return t
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
def run(iterations, feed_data):
# global feed_data
reader_time = []
run_time = []
for batch_id in range(iterations):
start_time = time.time()
if not skip_reader:
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
end_time = time.time()
reader_time.append(end_time - start_time)
start_time = time.time()
if parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
feed=feed_data)
else:
fetch_vars = exe.run(fluid.default_main_program(),
feed=feed_data,
fetch_list=fetches)
end_time = time.time()
run_time.append(end_time - start_time)
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
if not args.use_pyramidbox:
print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
else:
print("Batch {0}, face loss {1}, head loss {2}".format(
batch_id, fetch_vars[0], fetch_vars[1]))
return reader_time, run_time
# warm-up
run(2, feed_data)
# profiling
start = time.time()
if not parallel:
with profiler.profiler('All', 'total', '/tmp/profile_file'):
reader_time, run_time = run(num_iterations, feed_data)
else:
reader_time, run_time = run(num_iterations, feed_data)
end = time.time()
total_time = end - start
print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
total_time, np.sum(reader_time), np.sum(run_time)))
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
data_dir = 'data/WIDERFACE/WIDER_train/images/'
train_file_list = 'label/train_gt_widerface.res'
config = reader.Settings(
data_dir=data_dir,
resize_h=args.resize_h,
resize_w=args.resize_w,
apply_expand=False,
mean_value=[104., 117., 123.],
ap_version='11point')
train(args, config, train_file_list, optimizer_method="momentum")
......@@ -81,10 +81,7 @@ class PyramidBox(object):
if self.is_infer:
return [self.image]
else:
return [
self.image, self.face_box, self.head_box, self.gt_label,
self.difficult
]
return [self.image, self.face_box, self.head_box, self.gt_label]
def _input(self):
self.image = fluid.layers.data(
......@@ -96,8 +93,6 @@ class PyramidBox(object):
name='head_box', shape=[4], dtype='float32', lod_level=1)
self.gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
self.difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
def _vgg(self):
self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2)
......@@ -144,7 +139,8 @@ class PyramidBox(object):
stride=2,
groups=ch,
param_attr=w_attr,
bias_attr=False)
bias_attr=False,
use_cudnn=True)
else:
upsampling = fluid.layers.resize_bilinear(
conv1, out_shape=up_to.shape[2:])
......@@ -385,6 +381,7 @@ class PyramidBox(object):
self.box_vars,
overlap_threshold=0.35,
neg_overlap=0.35)
face_loss.persistable = True
head_loss = fluid.layers.ssd_loss(
self.head_mbox_loc,
self.head_mbox_conf,
......@@ -394,9 +391,13 @@ class PyramidBox(object):
self.box_vars,
overlap_threshold=0.35,
neg_overlap=0.35)
head_loss.persistable = True
face_loss = fluid.layers.reduce_sum(face_loss)
face_loss.persistable = True
head_loss = fluid.layers.reduce_sum(head_loss)
head_loss.persistable = True
total_loss = face_loss + head_loss
total_loss.persistable = True
return face_loss, head_loss, total_loss
def infer(self, main_program=None):
......@@ -410,5 +411,8 @@ class PyramidBox(object):
self.face_mbox_conf,
self.prior_boxes,
self.box_vars,
nms_threshold=0.45)
nms_threshold=0.3,
nms_top_k=5000,
keep_top_k=750,
score_threshold=0.05)
return test_program, face_nmsed_out
......@@ -22,6 +22,9 @@ import xml.etree.ElementTree
import os
import time
import copy
import random
import cv2
from data_util import GeneratorEnqueuer
class Settings(object):
......@@ -36,112 +39,130 @@ class Settings(object):
apply_expand=True,
ap_version='11point',
toy=0):
self._dataset = dataset
self._ap_version = ap_version
self._toy = toy
self._data_dir = data_dir
self._apply_distort = apply_distort
self._apply_expand = apply_expand
self._resize_height = resize_h
self._resize_width = resize_w
self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
self.dataset = dataset
self.ap_version = ap_version
self.toy = toy
self.data_dir = data_dir
self.apply_distort = apply_distort
self.apply_expand = apply_expand
self.resize_height = resize_h
self.resize_width = resize_w
self.img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
'float32')
self._expand_prob = 0.5
self._expand_max_ratio = 4
self._hue_prob = 0.5
self._hue_delta = 18
self._contrast_prob = 0.5
self._contrast_delta = 0.5
self._saturation_prob = 0.5
self._saturation_delta = 0.5
self._brightness_prob = 0.5
self.expand_prob = 0.5
self.expand_max_ratio = 4
self.hue_prob = 0.5
self.hue_delta = 18
self.contrast_prob = 0.5
self.contrast_delta = 0.5
self.saturation_prob = 0.5
self.saturation_delta = 0.5
self.brightness_prob = 0.5
# brightness_delta is the delta normalized by 256: 32 / 256 = 0.125
self.brightness_delta = 0.125
@property
def dataset(self):
return self._dataset
@property
def ap_version(self):
return self._ap_version
self.brightness_delta = 0.125
self.scale = 0.007843 # 1 / 127.5
self.data_anchor_sampling_prob = 0.5
self.min_face_size = 8.0
@property
def toy(self):
return self._toy
@property
def apply_expand(self):
return self._apply_expand
def draw_image(faces_pred, img, resize_val):
for i in range(len(faces_pred)):
draw_rotate_rectangle(img, faces_pred[i], resize_val, (0, 255, 0), 3)
@property
def apply_distort(self):
return self._apply_distort
@property
def data_dir(self):
return self._data_dir
def draw_rotate_rectangle(img, face, resize_val, color, thickness):
cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
face[3] * resize_val), int(face[2] * resize_val)), color, thickness)
@data_dir.setter
def data_dir(self, data_dir):
self._data_dir = data_dir
cv2.line(img, (int(face[3] * resize_val), int(face[2] * resize_val)), (int(
face[3] * resize_val), int(face[4] * resize_val)), color, thickness)
@property
def label_list(self):
return self._label_list
cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
@property
def resize_h(self):
return self._resize_height
cv2.line(img, (int(face[3] * resize_val), int(face[4] * resize_val)), (int(
face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
@property
def resize_w(self):
return self._resize_width
@property
def img_mean(self):
return self._img_mean
def preprocess(img, bbox_labels, mode, settings):
def preprocess(img, bbox_labels, mode, settings, image_path):
img_width, img_height = img.size
sampled_labels = bbox_labels
if mode == 'train':
if settings._apply_distort:
if settings.apply_distort:
img = image_util.distort_image(img, settings)
if settings._apply_expand:
if settings.apply_expand:
img, bbox_labels, img_width, img_height = image_util.expand_image(
img, bbox_labels, img_width, img_height, settings)
# sampling
batch_sampler = []
# hard-code here
batch_sampler.append(
image_util.sampler(1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,
True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,
True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,
True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,
True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,
True))
sampled_bbox = image_util.generate_batch_samples(
batch_sampler, bbox_labels, img_width, img_height)
img = np.array(img)
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width, img_height)
img = Image.fromarray(img)
img = img.resize((settings.resize_w, settings.resize_h), Image.ANTIALIAS)
prob = random.uniform(0., 1.)
if prob > settings.data_anchor_sampling_prob:
scale_array = np.array([16, 32, 64, 128, 256, 512])
batch_sampler.append(
image_util.sampler(1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2,
0.0, True))
sampled_bbox = image_util.generate_batch_random_samples(
batch_sampler, bbox_labels, img_width, img_height, scale_array,
settings.resize_width, settings.resize_height)
img = np.array(img)
# Debug
# img_save = Image.fromarray(img)
# img_save.save('img_orig.jpg')
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image_sampling(
img, bbox_labels, sampled_bbox[idx], img_width, img_height,
settings.resize_width, settings.resize_height,
settings.min_face_size)
img = img.astype('uint8')
# Debug: visualize the gt bbox
visualize_bbox = 0
if visualize_bbox:
img_show = img
draw_image(sampled_labels, img_show, settings.resize_height)
img_show = Image.fromarray(img_show)
img_show.save('final_img_show.jpg')
img = Image.fromarray(img)
# Debug
# img.save('final_img.jpg')
else:
# hard-code here
batch_sampler.append(
image_util.sampler(1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0,
0.0, True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0,
0.0, True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0,
0.0, True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0,
0.0, True))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0,
0.0, True))
sampled_bbox = image_util.generate_batch_samples(
batch_sampler, bbox_labels, img_width, img_height)
img = np.array(img)
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width, img_height,
settings.resize_width, settings.resize_height,
settings.min_face_size)
img = Image.fromarray(img)
img = img.resize((settings.resize_width, settings.resize_height),
Image.ANTIALIAS)
img = np.array(img)
if mode == 'train':
......@@ -160,27 +181,26 @@ def preprocess(img, bbox_labels, mode, settings):
img = img[[2, 1, 0], :, :]
img = img.astype('float32')
img -= settings.img_mean
img = img * 0.007843
img = img * settings.scale
return img, sampled_labels
def put_txt_in_dict(input_txt):
def load_file_list(input_txt):
with open(input_txt, 'r') as f_dir:
lines_input_txt = f_dir.readlines()
dict_input_txt = {}
file_dict = {}
num_class = 0
for i in range(len(lines_input_txt)):
tmp_line_txt = lines_input_txt[i].strip('\n\t\r')
if '--' in tmp_line_txt:
if i != 0:
num_class += 1
dict_input_txt[num_class] = []
file_dict[num_class] = []
dict_name = tmp_line_txt
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
if '--' not in tmp_line_txt:
if len(tmp_line_txt) > 6:
# tmp_line_txt = tmp_line_txt[:-2]
split_str = tmp_line_txt.split(' ')
x1_min = float(split_str[0])
y1_min = float(split_str[1])
......@@ -188,11 +208,11 @@ def put_txt_in_dict(input_txt):
y2_max = float(split_str[3])
tmp_line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
x2_max) + ' ' + str(y2_max)
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
else:
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
return dict_input_txt
return file_dict
def expand_bboxes(bboxes,
......@@ -219,67 +239,106 @@ def expand_bboxes(bboxes,
return expand_boxes
def pyramidbox(settings, file_list, mode, shuffle):
dict_input_txt = {}
dict_input_txt = put_txt_in_dict(file_list)
def train_generator(settings, file_list, batch_size, shuffle=True):
file_dict = load_file_list(file_list)
while True:
if shuffle:
# keys are the consecutive ints 0..N-1, so shuffling the dict remaps the
# per-image lists across keys, i.e. it shuffles the sample order
random.shuffle(file_dict)
images, face_boxes, head_boxes, label_ids = [], [], [], []
label_offs = [0]
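# label_offs accumulates LoD offsets: the boxes/labels of image i occupy
# the slice [label_offs[i], label_offs[i + 1]) of the flattened lists.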
def reader():
if mode == 'train' and shuffle:
random.shuffle(dict_input_txt)
for index_image in range(len(dict_input_txt)):
image_name = dict_input_txt[index_image][0] + '.jpg'
for index_image in file_dict.keys():
image_name = file_dict[index_image][0] + '.jpg'
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
im_width, im_height = im.size
# layout: label | xmin | ymin | xmax | ymax
if mode == 'train':
bbox_labels = []
for index_box in range(len(dict_input_txt[index_image])):
if index_box >= 2:
bbox_sample = []
temp_info_box = dict_input_txt[index_image][
index_box].split(' ')
xmin = float(temp_info_box[0])
ymin = float(temp_info_box[1])
w = float(temp_info_box[2])
h = float(temp_info_box[3])
xmax = xmin + w
ymax = ymin + h
bbox_sample.append(1)
bbox_sample.append(float(xmin) / im_width)
bbox_sample.append(float(ymin) / im_height)
bbox_sample.append(float(xmax) / im_width)
bbox_sample.append(float(ymax) / im_height)
bbox_labels.append(bbox_sample)
im, sample_labels = preprocess(im, bbox_labels, mode, settings)
sample_labels = np.array(sample_labels)
if len(sample_labels) == 0: continue
im = im.astype('float32')
boxes = sample_labels[:, 1:5]
lbls = [1] * len(boxes)
difficults = [1] * len(boxes)
yield im, boxes, expand_bboxes(boxes), lbls, difficults
if mode == 'test':
yield im, image_path
return reader
bbox_labels = []
for index_box in range(len(file_dict[index_image])):
if index_box >= 2:
bbox_sample = []
temp_info_box = file_dict[index_image][index_box].split(' ')
xmin = float(temp_info_box[0])
ymin = float(temp_info_box[1])
w = float(temp_info_box[2])
h = float(temp_info_box[3])
xmax = xmin + w
ymax = ymin + h
bbox_sample.append(1)
bbox_sample.append(float(xmin) / im_width)
bbox_sample.append(float(ymin) / im_height)
bbox_sample.append(float(xmax) / im_width)
bbox_sample.append(float(ymax) / im_height)
bbox_labels.append(bbox_sample)
im, sample_labels = preprocess(im, bbox_labels, "train", settings,
image_path)
sample_labels = np.array(sample_labels)
if len(sample_labels) == 0: continue
im = im.astype('float32')
face_box = sample_labels[:, 1:5]
head_box = expand_bboxes(face_box)
label = [1] * len(face_box)
images.append(im)
face_boxes.extend(face_box)
head_boxes.extend(head_box)
label_ids.extend(label)
label_offs.append(label_offs[-1] + len(face_box))
if len(images) == batch_size:
images = np.array(images).astype('float32')
face_boxes = np.array(face_boxes).astype('float32')
head_boxes = np.array(head_boxes).astype('float32')
label_ids = np.array(label_ids).astype('int32')
yield images, face_boxes, head_boxes, label_ids, label_offs
images, face_boxes, head_boxes = [], [], []
label_ids, label_offs = [], [0]
def train_batch_reader(settings,
file_list,
batch_size,
shuffle=True,
num_workers=8):
try:
enqueuer = GeneratorEnqueuer(
train_generator(settings, file_list, batch_size, shuffle),
use_multiprocessing=False)
enqueuer.start(max_queue_size=24, workers=num_workers)
generator_output = None
while True:
while enqueuer.is_running():
if not enqueuer.queue.empty():
generator_output = enqueuer.queue.get()
break
else:
time.sleep(0.01)
yield generator_output
generator_output = None
finally:
if enqueuer is not None:
enqueuer.stop()
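# Illustrative usage sketch (arguments are placeholders, not part of the
# original file):
#
#   batches = train_batch_reader(settings, 'label/train_gt_widerface.res',
#                                batch_size=12)
#   images, face_boxes, head_boxes, labels, lod = next(batches)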
def train(settings, file_list, shuffle=True):
return pyramidbox(settings, file_list, 'train', shuffle)
def test(settings, file_list):
file_dict = load_file_list(file_list)
def reader():
for index_image in file_dict.keys():
image_name = file_dict[index_image][0] + '.jpg'
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
yield im, image_path
def test(settings, file_list):
return pyramidbox(settings, file_list, 'test', False)
return reader
def infer(settings, image_path):
......@@ -288,8 +347,8 @@ def infer(settings, image_path):
if img.mode == 'L':
img = img.convert('RGB')
im_width, im_height = img.size
if settings.resize_w and settings.resize_h:
img = img.resize((settings.resize_w, settings.resize_h),
if settings.resize_width and settings.resize_height:
img = img.resize((settings.resize_width, settings.resize_height),
Image.ANTIALIAS)
img = np.array(img)
# HWC to CHW
......@@ -300,9 +359,7 @@ def infer(settings, image_path):
img = img[[2, 1, 0], :, :]
img = img.astype('float32')
img -= settings.img_mean
img = img * 0.007843
img = [img]
img = np.array(img)
return img
img = img * settings.scale
return np.array([img])
return batch_reader
......@@ -15,42 +15,52 @@ parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 12, "Minibatch size.")
add_arg('num_passes', int, 120, "Epoch number.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('dataset', str, 'WIDERFACE', "coco2014, coco2017, and pascalvoc.")
add_arg('model_save_dir', str, 'model', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 12, "Minibatch size.")
add_arg('num_passes', int, 160, "Epoch number.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
#yapf: enable
def train(args, data_args, learning_rate, batch_size, pretrained_model,
num_passes, optimizer_method):
def train(args, config, train_file_list, optimizer_method):
learning_rate = args.learning_rate
batch_size = args.batch_size
num_passes = args.num_passes
height = args.resize_h
width = args.resize_w
use_gpu = args.use_gpu
use_pyramidbox = args.use_pyramidbox
model_save_dir = args.model_save_dir
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
num_classes = 2
image_shape = [3, height, width]
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
image_shape = [3, data_args.resize_h, data_args.resize_w]
fetches = []
network = PyramidBox(image_shape, num_classes,
sub_network=args.use_pyramidbox)
if args.use_pyramidbox:
sub_network=use_pyramidbox)
if use_pyramidbox:
face_loss, head_loss, loss = network.train()
fetches = [face_loss, head_loss]
else:
loss = network.vgg_ssd_loss()
fetches = [loss]
epocs = 12880 / batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
steps_per_pass = 12880 / batch_size
boundaries = [steps_per_pass * 50, steps_per_pass * 80,
steps_per_pass * 120, steps_per_pass * 140]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
......@@ -70,9 +80,10 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model,
)
optimizer.minimize(loss)
# fluid.memory_optimize(fluid.default_main_program())
if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program())
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -80,7 +91,7 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model,
if pretrained_model:
if pretrained_model.isdigit():
start_pass = int(pretrained_model) + 1
pretrained_model = os.path.join(args.model_save_dir, pretrained_model)
pretrained_model = os.path.join(model_save_dir, pretrained_model)
print("Resume from %s " %(pretrained_model))
if not os.path.exists(pretrained_model):
......@@ -92,11 +103,9 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model,
if args.parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=args.use_gpu, loss_name=loss.name)
use_cuda=use_gpu, loss_name=loss.name)
train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=network.feeds())
train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def save_model(postfix):
model_path = os.path.join(model_save_dir, postfix)
......@@ -105,20 +114,34 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model,
print 'save models to %s' % (model_path)
fluid.io.save_persistables(exe, model_path)
def tensor(data, place, lod=None):
t = fluid.core.LoDTensor()
t.set(data, place)
if lod:
t.set_lod(lod)
return t
for pass_id in range(start_pass, num_passes):
start_time = time.time()
prev_start_time = start_time
end_time = 0
for batch_id, data in enumerate(train_reader()):
for batch_id in range(steps_per_pass):
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feeding = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
prev_start_time = start_time
start_time = time.time()
if len(data) < 2 * devices_num: continue
if args.parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
feed=feeder.feed(data))
feed=feeding)
else:
fetch_vars = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
feed=feeding,
fetch_list=fetches)
end_time = time.time()
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
......@@ -143,22 +166,13 @@ if __name__ == '__main__':
data_dir = 'data/WIDERFACE/WIDER_train/images/'
train_file_list = 'label/train_gt_widerface.res'
val_file_list = 'label/val_gt_widerface.res'
model_save_dir = args.model_save_dir
data_args = reader.Settings(
dataset=args.dataset,
config = reader.Settings(
data_dir=data_dir,
resize_h=args.resize_h,
resize_w=args.resize_w,
apply_distort=True,
apply_expand=False,
mean_value=[104., 117., 123],
mean_value=[104., 117., 123.],
ap_version='11point')
train(
args,
data_args=data_args,
learning_rate=args.learning_rate,
batch_size=args.batch_size,
pretrained_model=args.pretrained_model,
num_passes=args.num_passes,
optimizer_method="momentum")
train(args, config, train_file_list, optimizer_method="momentum")
Running the program examples in this directory requires the latest develop version of PaddlePaddle. If your installed PaddlePaddle version is lower than this requirement, please follow the instructions in the [installation documentation](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html) to update it.
## Code Structure
```
├── network.py   # network definition
├── train.py     # training script
├── eval.py      # evaluation script
├── infer.py     # inference script
├── cityscape.py # data preprocessing script
└── utils.py     # common utility functions
```
## Introduction
Image Cascade Network (ICNet) targets real-time semantic segmentation of images. Compared with other methods that reduce computation, ICNet takes both speed and accuracy into account.
The main idea of ICNet is to transform the input image into several resolutions and process each with a sub-network of matching computational complexity, then fuse the results. ICNet consists of three sub-networks: a computationally heavy network handles the low-resolution input while lightweight networks handle the high-resolution inputs, balancing accuracy on high-resolution images against the efficiency of low-complexity networks.
The overall network structure is as follows:
<p align="center">
<img src="images/icnet.png" width="620" hspace='10'/> <br/>
<strong>Figure 1</strong>
</p>
## Data Preparation
This example uses the Cityscapes dataset; please register at the [Cityscapes website](https://www.cityscapes-dataset.com) and download it. After downloading, process the data following the instructions and tool described [here](https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/createTrainIdLabelImgs.py#L3).
The processed data is organized as follows:
```
data/cityscape/
|-- gtFine
| |-- test
| |-- train
| `-- val
|-- leftImg8bit
| |-- test
| |-- train
| `-- val
|-- train.list
`-- val.list
```
Here, train.list and val.list are the list files used for training and testing respectively; the first column is the input image and the second column the annotation, separated by a space. For example:
```
leftImg8bit/train/stuttgart/stuttgart_000021_000019_leftImg8bit.png gtFine/train/stuttgart/stuttgart_000021_000019_gtFine_labelTrainIds.png
leftImg8bit/train/stuttgart/stuttgart_000072_000019_leftImg8bit.png gtFine/train/stuttgart/stuttgart_000072_000019_gtFine_labelTrainIds.png
```
After downloading and preparing the data, modify the corresponding data paths in the `cityscape.py` script.
## Model Training and Prediction
### Training
Run the following command to start training, specifying the checkpoint save path:
```
python train.py --batch_size=16 --use_gpu=True --checkpoint_path="./chkpnt/"
```
Use the following command for more usage instructions:
```
python train.py --help
```
During training, the `loss` of each network branch on the training set is printed according to the user's settings, for example:
```
Iter[0]; train loss: 2.338; sub4_loss: 3.367; sub24_loss: 4.120; sub124_loss: 0.151
```
### Testing
Run the following command to evaluate on the `Cityscapes` test dataset:
```
python eval.py --model_path="./model/" --use_gpu=True
```
The model file must be specified via the `--model_path` option.
The evaluation metric reported by the test script is [mean IoU]().
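As computed in `eval.py`, the metric averages the per-class IoU over the classes that actually occur (a sketch, with `correct_c` / `wrong_c` the per-class correct / wrong pixel counts):
```
mean_iou = (1 / |C|) * Σ_{c in C} correct_c / (correct_c + wrong_c)
```
where `C` is the set of classes with `correct_c + wrong_c > 0`.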
### Prediction
Run the following command to run prediction on the specified data:
```
python infer.py \
--model_path="./model" \
--images_path="./data/cityscape/" \
--images_list="./data/cityscape/infer.list"
```
The `--images_list` option specifies the list file, in which each line is the path of an image to predict.
Prediction results are saved to the `output` folder under the current path by default.
## Experiment Results
Figure 2 shows the training loss curve on the `Cityscapes` training set:
<p align="center">
<img src="images/train_loss.png" width="620" hspace='10'/> <br/>
<strong>Figure 2</strong>
</p>
Training on the training set and evaluating on the validation set yields mean_IoU = 67.0% (67.7% in the paper).
Figure 3 shows example predictions produced by the `infer.py` script: the first row is the original input image, the second row the manual annotation, and the third row our model's output.
<p align="center">
<img src="images/result.png" width="620" hspace='10'/> <br/>
<strong>Figure 3</strong>
</p>
## Other Information
|Dataset | Pretrained Model |
|---|---|
|Cityscapes | [Model]()[md: ] |
## References
- [ICNet for Real-Time Semantic Segmentation on High-Resolution Images](https://arxiv.org/abs/1704.08545)
"""Reader for Cityscape dataset.
"""
import os
import cv2
import numpy as np
import paddle.v2 as paddle
DATA_PATH = "./data/cityscape"
TRAIN_LIST = DATA_PATH + "/train.list"
TEST_LIST = DATA_PATH + "/val.list"
IGNORE_LABEL = 255
NUM_CLASSES = 19
TRAIN_DATA_SHAPE = (3, 720, 720)
TEST_DATA_SHAPE = (3, 1024, 2048)
IMG_MEAN = np.array((103.939, 116.779, 123.68), dtype=np.float32)
def train_data_shape():
return TRAIN_DATA_SHAPE
def test_data_shape():
return TEST_DATA_SHAPE
def num_classes():
return NUM_CLASSES
class DataGenerater:
def __init__(self, data_list, mode="train", flip=True, scaling=True):
self.flip = flip
self.scaling = scaling
self.image_label = []
with open(data_list, 'r') as f:
for line in f:
image_file, label_file = line.strip().split(' ')
self.image_label.append((image_file, label_file))
def create_train_reader(self, batch_size):
"""
Create a reader for train dataset.
"""
def reader():
np.random.shuffle(self.image_label)
images = []
labels_sub1 = []
labels_sub2 = []
labels_sub4 = []
count = 0
for image, label in self.image_label:
image, label_sub1, label_sub2, label_sub4 = self.process_train_data(
image, label)
count += 1
images.append(image)
labels_sub1.append(label_sub1)
labels_sub2.append(label_sub2)
labels_sub4.append(label_sub4)
if count == batch_size:
yield self.mask(
np.array(images),
np.array(labels_sub1),
np.array(labels_sub2), np.array(labels_sub4))
images = []
labels_sub1 = []
labels_sub2 = []
labels_sub4 = []
count = 0
if images:
yield self.mask(
np.array(images),
np.array(labels_sub1),
np.array(labels_sub2), np.array(labels_sub4))
return reader
def create_test_reader(self):
"""
Create a reader for test dataset.
"""
def reader():
for image, label in self.image_label:
image, label = self.load(image, label)
image = paddle.image.to_chw(image)[np.newaxis, :]
label = label[np.newaxis, :, :, np.newaxis].astype("float32")
label_mask = np.where((label != IGNORE_LABEL).flatten())[
0].astype("int32")
yield image, label, label_mask
return reader
def process_train_data(self, image, label):
"""
Process training data.
"""
image, label = self.load(image, label)
if self.flip:
image, label = self.random_flip(image, label)
if self.scaling:
image, label = self.random_scaling(image, label)
image, label = self.resize(image, label, out_size=TRAIN_DATA_SHAPE[1:])
label = label.astype("float32")
label_sub1 = paddle.image.to_chw(self.scale_label(label, factor=4))
label_sub2 = paddle.image.to_chw(self.scale_label(label, factor=8))
label_sub4 = paddle.image.to_chw(self.scale_label(label, factor=16))
image = paddle.image.to_chw(image)
return image, label_sub1, label_sub2, label_sub4
def load(self, image, label):
"""
Load image from file.
"""
image = paddle.image.load_image(
DATA_PATH + "/" + image, is_color=True).astype("float32")
image -= IMG_MEAN
label = paddle.image.load_image(
DATA_PATH + "/" + label, is_color=False).astype("float32")
return image, label
def random_flip(self, image, label):
"""
Flip image and label randomly.
"""
r = np.random.rand(1)
if r > 0.5:
image = paddle.image.left_right_flip(image, is_color=True)
label = paddle.image.left_right_flip(label, is_color=False)
return image, label
def random_scaling(self, image, label):
"""
Scale image and label randomly.
"""
scale = np.random.uniform(0.5, 2.0, 1)[0]
h_new = int(image.shape[0] * scale)
w_new = int(image.shape[1] * scale)
image = cv2.resize(image, (w_new, h_new))
label = cv2.resize(
label, (w_new, h_new), interpolation=cv2.INTER_NEAREST)
return image, label
def padding_as(self, image, h, w, is_color):
"""
Padding image.
"""
pad_h = max(image.shape[0], h) - image.shape[0]
pad_w = max(image.shape[1], w) - image.shape[1]
if is_color:
return np.pad(image, ((0, pad_h), (0, pad_w), (0, 0)), 'constant')
else:
return np.pad(image, ((0, pad_h), (0, pad_w)), 'constant')
def resize(self, image, label, out_size):
"""
Resize image and label by padding or cropping.
"""
ignore_label = IGNORE_LABEL
label = label - ignore_label
if len(label.shape) == 2:
label = label[:, :, np.newaxis]
combined = np.concatenate((image, label), axis=2)
combined = self.padding_as(
combined, out_size[0], out_size[1], is_color=True)
combined = paddle.image.random_crop(
combined, out_size[0], is_color=True)
image = combined[:, :, 0:3]
label = combined[:, :, 3:4] + ignore_label
return image, label
def scale_label(self, label, factor):
"""
Scale label according to factor.
"""
h = label.shape[0] / factor
w = label.shape[1] / factor
# cv2.resize expects the target size as (width, height)
return cv2.resize(
label, (w, h), interpolation=cv2.INTER_NEAREST)[:, :, np.newaxis]
def mask(self, image, label0, label1, label2):
"""
Get mask for valid pixels.
"""
mask_sub1 = np.where(((label0 < (NUM_CLASSES + 1)) & (
label0 != IGNORE_LABEL)).flatten())[0].astype("int32")
mask_sub2 = np.where(((label1 < (NUM_CLASSES + 1)) & (
label1 != IGNORE_LABEL)).flatten())[0].astype("int32")
mask_sub4 = np.where(((label2 < (NUM_CLASSES + 1)) & (
label2 != IGNORE_LABEL)).flatten())[0].astype("int32")
return image.astype(
"float32"), label0, mask_sub1, label1, mask_sub2, label2, mask_sub4
def train(batch_size=32, flip=True, scaling=True):
"""
Cityscape training set reader.
It returns a reader, in which each result is a batch with batch_size samples.
:param batch_size: The batch size of each result returned by the reader.
:type batch_size: int
:param flip: Whether to flip images randomly.
:type flip: bool
:param scaling: Whether to scale images randomly.
:type scaling: bool
:return: Training reader.
:rtype: callable
"""
reader = DataGenerater(
TRAIN_LIST, flip=flip, scaling=scaling).create_train_reader(batch_size)
return reader
def test():
"""
Cityscape validation set reader.
It returns a reader, in which each result is a sample.
:return: Test reader.
:rtype: callable
"""
reader = DataGenerater(TEST_LIST).create_test_reader()
return reader
def infer(image_list=TEST_LIST):
"""
Infer set reader.
It returns a reader, in which each result is a sample.
:param image_list: The image list file in which each line is the path of an image to be inferred.
:type image_list: str
:return: Infer reader.
:rtype: callable
"""
reader = DataGenerater(image_list).create_test_reader()
return reader
"""Evaluator for ICNet model."""
import paddle.fluid as fluid
import numpy as np
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
from icnet import icnet
import cityscape
import argparse
import functools
import sys
import os
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model_path', str, None, "Model path.")
add_arg('use_gpu', bool, True, "Whether use GPU to test.")
# yapf: enable
def cal_mean_iou(wrong, correct):
sum = wrong + correct
true_num = (sum != 0).sum()
for i in range(len(sum)):
if sum[i] == 0:
sum[i] = 1
return (correct.astype("float64") / sum).sum() / true_num
def create_iou(predict, label, mask, num_classes, image_shape):
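# Upsample the prediction to the label resolution, take the per-pixel
# argmax as the predicted class, keep only the valid pixels selected by
# `mask`, and accumulate per-class wrong/correct counts via mean_iou.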
predict = fluid.layers.resize_bilinear(predict, out_shape=image_shape[1:3])
predict = fluid.layers.transpose(predict, perm=[0, 2, 3, 1])
predict = fluid.layers.reshape(predict, shape=[-1, num_classes])
label = fluid.layers.reshape(label, shape=[-1, 1])
_, predict = fluid.layers.topk(predict, k=1)
predict = fluid.layers.cast(predict, dtype="float32")
predict = fluid.layers.gather(predict, mask)
label = fluid.layers.gather(label, mask)
label = fluid.layers.cast(label, dtype="int32")
predict = fluid.layers.cast(predict, dtype="int32")
iou, out_w, out_r = fluid.layers.mean_iou(predict, label, num_classes)
return iou, out_w, out_r
def eval(args):
data_shape = cityscape.test_data_shape()
num_classes = cityscape.num_classes()
# define network
images = fluid.layers.data(name='image', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int32')
mask = fluid.layers.data(name='mask', shape=[-1], dtype='int32')
_, _, sub124_out = icnet(images, num_classes,
np.array(data_shape[1:]).astype("float32"))
iou, out_w, out_r = create_iou(sub124_out, label, mask, num_classes,
data_shape)
inference_program = fluid.default_main_program().clone(for_test=True)
# prepare environment
place = fluid.CPUPlace()
if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
assert os.path.exists(args.model_path)
fluid.io.load_params(exe, args.model_path)
print "loaded model from: %s" % args.model_path
sys.stdout.flush()
fetch_vars = [iou, out_w, out_r]
out_wrong = np.zeros([num_classes]).astype("int64")
out_right = np.zeros([num_classes]).astype("int64")
count = 0
test_reader = cityscape.test()
for data in test_reader():
count += 1
result = exe.run(inference_program,
feed=get_feeder_data(
data, place, for_test=True),
fetch_list=fetch_vars)
out_wrong += result[1]
out_right += result[2]
print "count: %s; current iou: %.3f;\r" % (count, result[0]),
sys.stdout.flush()
iou = cal_mean_iou(out_wrong, out_right)
print "\nmean iou: %.3f" % iou
def main():
args = parser.parse_args()
print_arguments(args)
eval(args)
if __name__ == "__main__":
main()
import paddle.fluid as fluid
import numpy as np
import sys
def conv(input,
k_h,
k_w,
c_o,
s_h,
s_w,
relu=False,
padding="VALID",
biased=False,
name=None):
act = None
tmp = input
if relu:
act = "relu"
if padding == "SAME":
padding_h = max(k_h - s_h, 0)
padding_w = max(k_w - s_w, 0)
padding_top = padding_h / 2
padding_left = padding_w / 2
padding_bottom = padding_h - padding_top
padding_right = padding_w - padding_left
padding = [
0, 0, 0, 0, padding_top, padding_bottom, padding_left, padding_right
]
tmp = fluid.layers.pad(tmp, padding)
tmp = fluid.layers.conv2d(
tmp,
num_filters=c_o,
filter_size=[k_h, k_w],
stride=[s_h, s_w],
groups=1,
act=act,
bias_attr=biased,
use_cudnn=False,
name=name)
return tmp
def atrous_conv(input,
k_h,
k_w,
c_o,
dilation,
relu=False,
padding="VALID",
biased=False,
name=None):
act = None
if relu:
act = "relu"
tmp = input
if padding == "SAME":
# the dilated conv has stride 1; SAME padding is computed from the
# effective (dilated) kernel size
k_h_eff = k_h + (k_h - 1) * (dilation - 1)
k_w_eff = k_w + (k_w - 1) * (dilation - 1)
padding_h = max(k_h_eff - 1, 0)
padding_w = max(k_w_eff - 1, 0)
padding_top = padding_h / 2
padding_left = padding_w / 2
padding_bottom = padding_h - padding_top
padding_right = padding_w - padding_left
padding = [
0, 0, 0, 0, padding_top, padding_bottom, padding_left, padding_right
]
tmp = fluid.layers.pad(tmp, padding)
tmp = fluid.layers.conv2d(
tmp,
num_filters=c_o,
filter_size=[k_h, k_w],
dilation=dilation,
groups=1,
act=act,
bias_attr=biased,
use_cudnn=False,
name=name)
return tmp
def zero_padding(input, padding):
return fluid.layers.pad(input,
[0, 0, 0, 0, padding, padding, padding, padding])
def bn(input, relu=False, name=None, is_test=False):
act = None
if relu:
act = 'relu'
name = input.name.split(".")[0] + "_bn"
tmp = fluid.layers.batch_norm(
input, act=act, momentum=0.95, epsilon=1e-5, name=name, is_test=is_test)
return tmp
def avg_pool(input, k_h, k_w, s_h, s_w, name=None, padding=0):
temp = fluid.layers.pool2d(
input,
pool_size=[k_h, k_w],
pool_type="avg",
pool_stride=[s_h, s_w],
pool_padding=padding,
name=name)
return temp
def max_pool(input, k_h, k_w, s_h, s_w, name=None, padding=0):
temp = fluid.layers.pool2d(
input,
pool_size=[k_h, k_w],
pool_type="max",
pool_stride=[s_h, s_w],
pool_padding=padding,
name=name)
return temp
def interp(input, out_shape):
out_shape = list(out_shape.astype("int32"))
return fluid.layers.resize_bilinear(input, out_shape=out_shape)
def dilation_convs(input):
tmp = res_block(input, filter_num=256, padding=1, name="conv3_2")
tmp = res_block(tmp, filter_num=256, padding=1, name="conv3_3")
tmp = res_block(tmp, filter_num=256, padding=1, name="conv3_4")
tmp = proj_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_1")
tmp = res_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_2")
tmp = res_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_3")
tmp = res_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_4")
tmp = res_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_5")
tmp = res_block(tmp, filter_num=512, padding=2, dilation=2, name="conv4_6")
tmp = proj_block(
tmp, filter_num=1024, padding=4, dilation=4, name="conv5_1")
tmp = res_block(tmp, filter_num=1024, padding=4, dilation=4, name="conv5_2")
tmp = res_block(tmp, filter_num=1024, padding=4, dilation=4, name="conv5_3")
return tmp
def pyramid_pooling(input, input_shape):
shape = np.ceil(input_shape / 32).astype("int32")
h, w = shape
pool1 = avg_pool(input, h, w, h, w)
pool1_interp = interp(pool1, shape)
pool2 = avg_pool(input, h / 2, w / 2, h / 2, w / 2)
pool2_interp = interp(pool2, shape)
pool3 = avg_pool(input, h / 3, w / 3, h / 3, w / 3)
pool3_interp = interp(pool3, shape)
pool4 = avg_pool(input, h / 4, w / 4, h / 4, w / 4)
pool4_interp = interp(pool4, shape)
conv5_3_sum = input + pool4_interp + pool3_interp + pool2_interp + pool1_interp
return conv5_3_sum
def shared_convs(image):
tmp = conv(image, 3, 3, 32, 2, 2, padding='SAME', name="conv1_1_3_3_s2")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 3, 3, 32, 1, 1, padding='SAME', name="conv1_2_3_3")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 3, 3, 64, 1, 1, padding='SAME', name="conv1_3_3_3")
tmp = bn(tmp, relu=True)
tmp = max_pool(tmp, 3, 3, 2, 2, padding=[1, 1])
tmp = proj_block(tmp, filter_num=128, padding=0, name="conv2_1")
tmp = res_block(tmp, filter_num=128, padding=1, name="conv2_2")
tmp = res_block(tmp, filter_num=128, padding=1, name="conv2_3")
tmp = proj_block(tmp, filter_num=256, padding=1, stride=2, name="conv3_1")
return tmp
def res_block(input, filter_num, padding=0, dilation=None, name=None):
tmp = conv(input, 1, 1, filter_num / 4, 1, 1, name=name + "_1_1_reduce")
tmp = bn(tmp, relu=True)
tmp = zero_padding(tmp, padding=padding)
if dilation is None:
tmp = conv(tmp, 3, 3, filter_num / 4, 1, 1, name=name + "_3_3")
else:
tmp = atrous_conv(
tmp, 3, 3, filter_num / 4, dilation, name=name + "_3_3")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
tmp = bn(tmp, relu=False)
tmp = input + tmp
tmp = fluid.layers.relu(tmp, name=name + "_relu")
return tmp
def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
name=None):
proj = conv(
input, 1, 1, filter_num, stride, stride, name=name + "_1_1_proj")
proj_bn = bn(proj, relu=False)
tmp = conv(
input, 1, 1, filter_num / 4, stride, stride, name=name + "_1_1_reduce")
tmp = bn(tmp, relu=True)
tmp = zero_padding(tmp, padding=padding)
if padding == 0:
padding = 'SAME'
else:
padding = 'VALID'
if dilation is None:
tmp = conv(
tmp,
3,
3,
filter_num / 4,
1,
1,
padding=padding,
name=name + "_3_3")
else:
tmp = atrous_conv(
tmp,
3,
3,
filter_num / 4,
dilation,
padding=padding,
name=name + "_3_3")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
tmp = bn(tmp, relu=False)
tmp = proj_bn + tmp
tmp = fluid.layers.relu(tmp, name=name + "_relu")
return tmp
def sub_net_4(input, input_shape):
tmp = interp(input, out_shape=np.ceil(input_shape / 32))
tmp = dilation_convs(tmp)
tmp = pyramid_pooling(tmp, input_shape)
tmp = conv(tmp, 1, 1, 256, 1, 1, name="conv5_4_k1")
tmp = bn(tmp, relu=True)
tmp = interp(tmp, input_shape / 16)
return tmp
def sub_net_2(input):
tmp = conv(input, 1, 1, 128, 1, 1, name="conv3_1_sub2_proj")
tmp = bn(tmp, relu=False)
return tmp
def sub_net_1(input):
tmp = conv(input, 3, 3, 32, 2, 2, padding='SAME', name="conv1_sub1")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 3, 3, 32, 2, 2, padding='SAME', name="conv2_sub1")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 3, 3, 64, 2, 2, padding='SAME', name="conv3_sub1")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 1, 1, 128, 1, 1, name="conv3_sub1_proj")
tmp = bn(tmp, relu=False)
return tmp
def CCF24(sub2_out, sub4_out, input_shape):
tmp = zero_padding(sub4_out, padding=2)
tmp = atrous_conv(tmp, 3, 3, 128, 2, name="conv_sub4")
tmp = bn(tmp, relu=False)
tmp = tmp + sub2_out
tmp = fluid.layers.relu(tmp)
tmp = interp(tmp, input_shape / 8)
return tmp
def CCF124(sub1_out, sub24_out, input_shape):
tmp = zero_padding(sub24_out, padding=2)
tmp = atrous_conv(tmp, 3, 3, 128, 2, name="conv_sub2")
tmp = bn(tmp, relu=False)
tmp = tmp + sub1_out
tmp = fluid.layers.relu(tmp)
tmp = interp(tmp, input_shape / 4)
return tmp
def icnet(data, num_classes, input_shape):
image_sub1 = data
image_sub2 = interp(data, out_shape=input_shape * 0.5)
s_convs = shared_convs(image_sub2)
sub4_out = sub_net_4(s_convs, input_shape)
sub2_out = sub_net_2(s_convs)
sub1_out = sub_net_1(image_sub1)
sub24_out = CCF24(sub2_out, sub4_out, input_shape)
sub124_out = CCF124(sub1_out, sub24_out, input_shape)
conv6_cls = conv(
sub124_out, 1, 1, num_classes, 1, 1, biased=True, name="conv6_cls")
sub4_out = conv(
sub4_out, 1, 1, num_classes, 1, 1, biased=True, name="sub4_out")
sub24_out = conv(
sub24_out, 1, 1, num_classes, 1, 1, biased=True, name="sub24_out")
return sub4_out, sub24_out, conv6_cls
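# Note: during training the three score maps are supervised at different
# scales -- sub4_out at 1/16, sub24_out at 1/8 and conv6_cls at 1/4 of the
# input resolution (see create_loss in train.py); only conv6_cls is used
# at evaluation and inference time.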
"""Infer for ICNet model."""
import cityscape
import argparse
import functools
import sys
import os
import cv2
import paddle.fluid as fluid
import paddle.v2 as paddle
from icnet import icnet
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import numpy as np
IMG_MEAN = np.array((103.939, 116.779, 123.68), dtype=np.float32)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model_path', str, None, "Model path.")
add_arg('images_list', str, None, "List file of images to be inferred.")
add_arg('images_path', str, None, "The images path.")
add_arg('out_path', str, "./output", "Output path.")
add_arg('use_gpu', bool, True, "Whether use GPU to test.")
# yapf: enable
data_shape = [3, 1024, 2048]
num_classes = 19
label_colours = [
[128, 64, 128], [244, 35, 231], [69, 69, 69],
# 0 = road, 1 = sidewalk, 2 = building
[102, 102, 156], [190, 153, 153], [153, 153, 153],
# 3 = wall, 4 = fence, 5 = pole
[250, 170, 29], [219, 219, 0], [106, 142, 35],
# 6 = traffic light, 7 = traffic sign, 8 = vegetation
[152, 250, 152], [69, 129, 180], [219, 19, 60],
# 9 = terrain, 10 = sky, 11 = person
[255, 0, 0], [0, 0, 142], [0, 0, 69],
# 12 = rider, 13 = car, 14 = truck
[0, 60, 100], [0, 79, 100], [0, 0, 230],
# 15 = bus, 16 = train, 17 = motorcycle
[119, 10, 32]
# 18 = bicycle
]
def color(input):
"""
Convert the inferred result to a color image.
"""
result = []
for i in input.flatten():
result.append(
[label_colours[i][2], label_colours[i][1], label_colours[i][0]])
result = np.array(result).reshape([input.shape[0], input.shape[1], 3])
return result
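# A quick sanity check: color(np.zeros((2, 2), dtype="int32")) yields a
# 2x2x3 array whose pixels all hold the "road" colour; note the channels
# are emitted in BGR order to match cv2.imwrite.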
def infer(args):
data_shape = cityscape.test_data_shape()
num_classes = cityscape.num_classes()
# define network
images = fluid.layers.data(name='image', shape=data_shape, dtype='float32')
_, _, sub124_out = icnet(images, num_classes,
np.array(data_shape[1:]).astype("float32"))
predict = fluid.layers.resize_bilinear(
sub124_out, out_shape=data_shape[1:3])
predict = fluid.layers.transpose(predict, perm=[0, 2, 3, 1])
predict = fluid.layers.reshape(predict, shape=[-1, num_classes])
_, predict = fluid.layers.topk(predict, k=1)
predict = fluid.layers.reshape(
predict,
shape=[data_shape[1], data_shape[2], -1]) # batch_size should be 1
inference_program = fluid.default_main_program().clone(for_test=True)
# prepare environment
place = fluid.CPUPlace()
if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
assert os.path.exists(args.model_path)
fluid.io.load_params(exe, args.model_path)
print "loaded model from: %s" % args.model_path
sys.stdout.flush()
if not os.path.isdir(args.out_path):
os.makedirs(args.out_path)
for line in open(args.images_list):
image_file = args.images_path + "/" + line.strip()
filename = os.path.basename(image_file)
image = paddle.image.load_image(
image_file, is_color=True).astype("float32")
image -= IMG_MEAN
img = paddle.image.to_chw(image)[np.newaxis, :]
image_t = fluid.core.LoDTensor()
image_t.set(img, place)
result = exe.run(inference_program,
feed={"image": image_t},
fetch_list=[predict])
cv2.imwrite(args.out_path + "/" + filename + "_result.png",
color(result[0]))
def main():
args = parser.parse_args()
print_arguments(args)
infer(args)
if __name__ == "__main__":
main()
"""Trainer for ICNet model."""
from icnet import icnet
import cityscape
import argparse
import functools
import sys
import time
import paddle.fluid as fluid
import numpy as np
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 16, "Minibatch size.")
add_arg('checkpoint_path', str, None, "Checkpoint save path.")
add_arg('init_model', str, None, "Pretrain model path.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('random_mirror', bool, True, "Whether to augment data with random mirroring.")
add_arg('random_scaling', bool, True, "Whether to augment data with random scaling.")
# yapf: enable
LAMBDA1 = 0.16
LAMBDA2 = 0.4
LAMBDA3 = 1.0
LEARNING_RATE = 0.003
POWER = 0.9
LOG_PERIOD = 1
CHECKPOINT_PERIOD = 1000
TOTAL_STEP = 60000
no_grad_set = []
def create_loss(predict, label, mask, num_classes):
predict = fluid.layers.transpose(predict, perm=[0, 2, 3, 1])
predict = fluid.layers.reshape(predict, shape=[-1, num_classes])
label = fluid.layers.reshape(label, shape=[-1, 1])
predict = fluid.layers.gather(predict, mask)
label = fluid.layers.gather(label, mask)
label = fluid.layers.cast(label, dtype="int64")
loss = fluid.layers.softmax_with_cross_entropy(predict, label)
no_grad_set.append(label.name)
return fluid.layers.reduce_mean(loss)
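# poly_decay below implements the "poly" learning-rate schedule
#     lr = LEARNING_RATE * (1 - step / TOTAL_STEP) ** POWER
# so with the constants above the rate starts at 0.003 and anneals to 0 at
# step 60000 (about 0.003 * 0.5 ** 0.9 ~= 0.0016 at the halfway point).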
def poly_decay():
global_step = _decay_step_counter()
with init_on_cpu():
decayed_lr = LEARNING_RATE * (fluid.layers.pow(
(1 - global_step / TOTAL_STEP), POWER))
return decayed_lr
def train(args):
data_shape = cityscape.train_data_shape()
num_classes = cityscape.num_classes()
# define network
images = fluid.layers.data(name='image', shape=data_shape, dtype='float32')
label_sub1 = fluid.layers.data(name='label_sub1', shape=[1], dtype='int32')
label_sub2 = fluid.layers.data(name='label_sub2', shape=[1], dtype='int32')
label_sub4 = fluid.layers.data(name='label_sub4', shape=[1], dtype='int32')
mask_sub1 = fluid.layers.data(name='mask_sub1', shape=[-1], dtype='int32')
mask_sub2 = fluid.layers.data(name='mask_sub2', shape=[-1], dtype='int32')
mask_sub4 = fluid.layers.data(name='mask_sub4', shape=[-1], dtype='int32')
sub4_out, sub24_out, sub124_out = icnet(
images, num_classes, np.array(data_shape[1:]).astype("float32"))
loss_sub4 = create_loss(sub4_out, label_sub4, mask_sub4, num_classes)
loss_sub24 = create_loss(sub24_out, label_sub2, mask_sub2, num_classes)
loss_sub124 = create_loss(sub124_out, label_sub1, mask_sub1, num_classes)
reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124
regularizer = fluid.regularizer.L2Decay(0.0001)
optimizer = fluid.optimizer.Momentum(
learning_rate=poly_decay(), momentum=0.9, regularization=regularizer)
_, params_grads = optimizer.minimize(reduced_loss, no_grad_set=no_grad_set)
# prepare environment
place = fluid.CPUPlace()
if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if args.init_model is not None:
print "load model from: %s" % args.init_model
sys.stdout.flush()
fluid.io.load_params(exe, args.init_model)
iter_id = 0
t_loss = 0.
sub4_loss = 0.
sub24_loss = 0.
sub124_loss = 0.
train_reader = cityscape.train(
args.batch_size, flip=args.random_mirror, scaling=args.random_scaling)
while True:
# train a pass
for data in train_reader():
if iter_id > TOTAL_STEP:
return
iter_id += 1
results = exe.run(
feed=get_feeder_data(data, place),
fetch_list=[reduced_loss, loss_sub4, loss_sub24, loss_sub124])
t_loss += results[0]
sub4_loss += results[1]
sub24_loss += results[2]
sub124_loss += results[3]
# training log
if iter_id % LOG_PERIOD == 0:
print "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % (
iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD)
t_loss = 0.
sub4_loss = 0.
sub24_loss = 0.
sub124_loss = 0.
sys.stdout.flush()
if iter_id % CHECKPOINT_PERIOD == 0:
dir_name = args.checkpoint_path + "/" + str(iter_id)
fluid.io.save_persistables(exe, dirname=dir_name)
print "Saved checkpoint: %s" % (dir_name)
def main():
args = parser.parse_args()
print_arguments(args)
train(args)
if __name__ == "__main__":
main()
"""Contains common utility functions."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
from paddle.fluid import core
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int32")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
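# For example, data = [[1, 2], [3, 4, 5]] produces lod [[0, 2, 5]] and a
# flattened int32 tensor of shape (5, 1); the lod records the start/end
# offsets of each sequence inside the flattened data.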
def get_feeder_data(data, place, for_test=False):
feed_dict = {}
image_t = core.LoDTensor()
image_t.set(data[0], place)
feed_dict["image"] = image_t
if not for_test:
labels_sub1_t = core.LoDTensor()
labels_sub2_t = core.LoDTensor()
labels_sub4_t = core.LoDTensor()
mask_sub1_t = core.LoDTensor()
mask_sub2_t = core.LoDTensor()
mask_sub4_t = core.LoDTensor()
labels_sub1_t.set(data[1], place)
labels_sub2_t.set(data[3], place)
mask_sub1_t.set(data[2], place)
mask_sub2_t.set(data[4], place)
labels_sub4_t.set(data[5], place)
mask_sub4_t.set(data[6], place)
feed_dict["label_sub1"] = labels_sub1_t
feed_dict["label_sub2"] = labels_sub2_t
feed_dict["mask_sub1"] = mask_sub1_t
feed_dict["mask_sub2"] = mask_sub2_t
feed_dict["label_sub4"] = labels_sub4_t
feed_dict["mask_sub4"] = mask_sub4_t
else:
label_t = core.LoDTensor()
mask_t = core.LoDTensor()
label_t.set(data[1], place)
mask_t.set(data[2], place)
feed_dict["label"] = label_t
feed_dict["mask"] = mask_t
return feed_dict
......@@ -38,7 +38,7 @@ class InferTaskConfig(object):
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_length = 256
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
......@@ -104,23 +104,28 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
break
# The placeholder for batch_size at compile time. It must currently be -1 to be
# consistent with some ops' infer-shape output at compile time, such as the
# sequence_expand op used in the beam-search decoder.
batch_size = -1
# The placeholder for sequence length at compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholders and are set to pass the infer-shape
# check at compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"src_word": [(batch_size * seq_len, 1L), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"src_pos": [(batch_size * seq_len, 1L), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias":
[(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This shape input is used to reshape the output of embedding layer.
"src_data_shape": [(3L, ), "int32"],
# This shape input is used to reshape before softmax in self attention.
......@@ -129,24 +134,23 @@ input_descs = {
"src_slf_attn_post_softmax_shape": [(4L, ), "int32"],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"trg_word": [(batch_size * seq_len, 1L), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"trg_pos": [(batch_size * seq_len, 1L), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(1, ModelHyperParams.n_head,
(ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(1, ModelHyperParams.n_head,
(ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This shape input is used to reshape the output of embedding layer.
"trg_data_shape": [(3L, ), "int32"],
# This shape input is used to reshape before softmax in self attention.
......@@ -162,15 +166,18 @@ input_descs = {
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(1, (ModelHyperParams.max_length + 1),
ModelHyperParams.d_model), "float32"],
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"lbl_word": [(batch_size * seq_len, 1L), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"],
"lbl_weight": [(batch_size * seq_len, 1L), "float32"],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(2L, ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(4L, ), "int32"],
"init_score": [(batch_size, 1L), "float32"],
}
# Names of word embedding table which might be reused for weight sharing.
......@@ -205,3 +212,12 @@ decoder_util_input_fields = (
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"trg_src_attn_bias", )
fast_decoder_util_input_fields = decoder_util_input_fields + (
"trg_slf_attn_pre_softmax_shape_delta",
"trg_slf_attn_post_softmax_shape_delta", )
......@@ -7,6 +7,7 @@ import paddle.fluid as fluid
import model
from model import wrap_encoder as encoder
from model import wrap_decoder as decoder
from model import fast_decode as fast_decoder
from config import *
from train import pad_batch_data
import reader
......@@ -87,7 +88,8 @@ def translate_batch(exe,
output_unk=True):
"""
Run the encoder program once and run the decoder program multiple times to
implement beam search externally.
implement beam search externally. This is deprecated since a faster beam
search decoder based solely on Fluid operators has been added.
"""
# Prepare data for encoder and run the encoder.
enc_in_data = pad_batch_data(
......@@ -297,7 +299,32 @@ def translate_batch(exe,
return seqs, scores[:, :n_best].tolist()
def infer(args):
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
def py_infer(test_data, trg_idx2word):
"""
Inference by beam search implemented in Python, while the calculations from
symbols to probabilities are executed by Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -341,49 +368,8 @@ def infer(args):
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to set dropout to the test mode.
encoder_program = fluid.io.get_inference_program(
target_vars=[enc_output], main_program=encoder_program)
decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
encoder_program = encoder_program.inference_optimize()
decoder_program = decoder_program.inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
batch_seqs, batch_scores = translate_batch(
......@@ -397,7 +383,7 @@ def infer(args):
(decoder_data_input_fields[-1], ),
[predict.name],
InferTaskConfig.beam_size,
InferTaskConfig.max_length,
InferTaskConfig.max_out_len,
InferTaskConfig.n_best,
len(data),
ModelHyperParams.n_head,
......@@ -416,6 +402,154 @@ def infer(args):
print(" ".join([trg_idx2word[idx] for idx in seq]))
def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
bos_idx, n_head, d_model, place):
"""
Put all padded data needed by beam search decoder into a dict.
"""
src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
# start tokens
trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64")
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, 1, 1]).astype("float32")
# These shape tensors are used in reshape_op.
src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([-1, 1, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array(
[-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
trg_slf_attn_pre_softmax_shape = np.array(
[-1, 1], dtype="int32") # only the first time step
trg_slf_attn_post_softmax_shape = np.array(
[-1, n_head, 1, 1], dtype="int32") # only the first time step
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
[-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
# These inputs are used to change the shapes in the loop of while op.
attn_pre_softmax_shape_delta = np.array([0, 1], dtype="int32")
attn_post_softmax_shape_delta = np.array([0, 0, 0, 1], dtype="int32")
def to_lodtensor(data, place, lod=None):
data_tensor = fluid.LoDTensor()
data_tensor.set(data, place)
if lod is not None:
data_tensor.set_lod(lod)
return data_tensor
# beamsearch_op must use tensors with lod
init_score = to_lodtensor(
np.zeros_like(
trg_word, dtype="float32"),
place, [range(trg_word.shape[0] + 1)] * 2)
trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2)
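# For a batch of two source sentences, for instance, both lod levels are
# [0, 1, 2]: each sentence enters the search as one branch holding a single
# candidate token, and beam_search later expands these branches via the lod.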
data_input_dict = dict(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
trg_src_attn_bias
]))
util_input_dict = dict(
zip(util_input_names, [
src_data_shape, src_slf_attn_pre_softmax_shape,
src_slf_attn_post_softmax_shape, trg_data_shape,
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta
]))
input_dict = dict(data_input_dict.items() + util_input_dict.items())
return input_dict
def fast_infer(test_data, trg_idx2word):
"""
Inference by beam search decoder based solely on Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
out_ids, out_scores = fast_decoder(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, InferTaskConfig.beam_size,
InferTaskConfig.max_out_len, ModelHyperParams.eos_idx)
fluid.io.load_vars(
exe,
InferTaskConfig.model_path,
vars=filter(lambda var: isinstance(var, fluid.framework.Parameter),
fluid.default_main_program().list_vars()))
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
data_input = prepare_batch_input(
data, encoder_data_input_fields + fast_decoder_data_input_fields,
encoder_util_input_fields + fast_decoder_util_input_fields,
ModelHyperParams.eos_idx, ModelHyperParams.bos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model, place)
seq_ids, seq_scores = exe.run(infer_program,
feed=data_input,
fetch_list=[out_ids, out_scores],
return_numpy=False)
# How to parse the results:
# Suppose the lod of seq_ids is:
# [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
# then from lod[0]:
# there are 2 source sentences, beam width is 3.
# from lod[1]:
# the first source sentence has 3 hyps; the lengths are 12, 12, 16
# the second source sentence has 3 hyps; the lengths are 14, 13, 15
hyps = [[] for i in range(len(data))]
scores = [[] for i in range(len(data))]
for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
for j in range(end - start): # for each candidate
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in post_process_seq(
np.array(seq_ids)[sub_start:sub_end])
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print hyps[i][-1]
if len(hyps[i]) >= InferTaskConfig.n_best:
break
def infer(args, inferencer=fast_infer):
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
inferencer(test_data, trg_idx2word)
if __name__ == "__main__":
args = parse_args()
infer(args)
......@@ -30,7 +30,8 @@ def multi_head_attention(queries,
n_head=1,
dropout_rate=0.,
pre_softmax_shape=None,
post_softmax_shape=None):
post_softmax_shape=None,
cache=None):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that
......@@ -116,6 +117,10 @@ def multi_head_attention(queries,
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
k = cache["k"] = layers.concat([cache["k"], k], axis=1)
v = cache["v"] = layers.concat([cache["v"], v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
......@@ -203,7 +208,7 @@ def prepare_encoder(src_word,
enc_input = src_word_emb + src_pos_enc
enc_input = layers.reshape(
x=enc_input,
shape=[-1, src_max_len, src_emb_dim],
shape=[batch_size, seq_len, src_emb_dim],
actual_shape=src_data_shape)
return layers.dropout(
enc_input, dropout_prob=dropout_rate,
......@@ -285,7 +290,8 @@ def decoder_layer(dec_input,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
src_attn_post_softmax_shape=None,
cache=None):
""" The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention.
......@@ -301,7 +307,8 @@ def decoder_layer(dec_input,
n_head,
dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, )
slf_attn_post_softmax_shape,
cache, )
slf_attn_output = post_process_layer(
dec_input,
slf_attn_output,
......@@ -350,7 +357,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
src_attn_post_softmax_shape=None,
caches=None):
"""
The decoder is composed of a stack of identical decoder_layer layers.
"""
......@@ -369,7 +377,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
src_attn_post_softmax_shape,
None if caches is None else caches[i], )
dec_input = dec_output
return dec_output
......@@ -384,6 +393,8 @@ def make_all_inputs(input_fields):
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
......@@ -517,7 +528,8 @@ def wrap_decoder(trg_vocab_size,
dropout_rate,
weight_sharing,
dec_inputs=None,
enc_output=None):
enc_output=None,
caches=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
......@@ -559,7 +571,8 @@ def wrap_decoder(trg_vocab_size,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
src_attn_post_softmax_shape,
caches, )
# Return logits for training and probs for inference.
if weight_sharing:
predict = layers.reshape(
......@@ -578,3 +591,145 @@ def wrap_decoder(trg_vocab_size,
shape=[-1, trg_vocab_size],
act="softmax" if dec_inputs is None else None)
return predict
def fast_decode(
src_vocab_size,
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
beam_size,
max_out_len,
eos_idx, ):
"""
Use beam search to decode. Caches will be used to store states of history
steps which can make the decoding faster.
"""
enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
dropout_rate, weight_sharing)
start_tokens, init_scores, trg_src_attn_bias, trg_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \
src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta = \
make_all_inputs(fast_decoder_data_input_fields +
fast_decoder_util_input_fields)
def beam_search():
max_len = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=max_out_len)
step_idx = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=0)
cond = layers.less_than(x=step_idx, y=max_len)
while_op = layers.While(cond)
# array states will be stored for each step.
ids = layers.array_write(start_tokens, step_idx)
scores = layers.array_write(init_scores, step_idx)
# cell states will be overwritten at each step.
# caches contains states of history steps to reduce redundant
# computation in decoder.
caches = [{
"k": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0)
} for i in range(n_layer)]
with while_op.block():
pre_ids = layers.array_read(array=ids, i=step_idx)
pre_scores = layers.array_read(array=scores, i=step_idx)
# sequence_expand can gather sequences according to lod, and thus is used
# in beam search to keep the states corresponding to the selected ids.
pre_src_attn_bias = layers.sequence_expand(
x=trg_src_attn_bias, y=pre_scores)
pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
pre_caches = [{
"k": layers.sequence_expand(
x=cache["k"], y=pre_scores),
"v": layers.sequence_expand(
x=cache["v"], y=pre_scores),
} for cache in caches]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
input=pre_enc_output, # can't use pre_ids here since it has lod
value=1,
shape=[-1, 1],
dtype=pre_ids.dtype),
y=layers.increment(
x=step_idx, value=1.0, in_place=False),
axis=0)
logits = wrap_decoder(
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs=(
pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
enc_output=pre_enc_output,
caches=pre_caches)
topk_scores, topk_indices = layers.topk(
input=layers.softmax(logits), k=beam_size)
accu_scores = layers.elementwise_add(
x=layers.log(topk_scores),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
# beam_search op uses lod to distinguish branches.
topk_indices = layers.lod_reset(topk_indices, pre_ids)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=eos_idx)
layers.increment(x=step_idx, value=1.0, in_place=True)
# update states
layers.array_write(selected_ids, i=step_idx, array=ids)
layers.array_write(selected_scores, i=step_idx, array=scores)
layers.assign(pre_src_attn_bias, trg_src_attn_bias)
layers.assign(pre_enc_output, enc_output)
for i in range(n_layer):
layers.assign(pre_caches[i]["k"], caches[i]["k"])
layers.assign(pre_caches[i]["v"], caches[i]["v"])
layers.assign(
layers.elementwise_add(
x=slf_attn_pre_softmax_shape,
y=attn_pre_softmax_shape_delta),
slf_attn_pre_softmax_shape)
layers.assign(
layers.elementwise_add(
x=slf_attn_post_softmax_shape,
y=attn_post_softmax_shape_delta),
slf_attn_post_softmax_shape)
length_cond = layers.less_than(x=step_idx, y=max_len)
finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=beam_size, end_id=eos_idx)
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()
return finished_ids, finished_scores
......@@ -198,7 +198,8 @@ class DataReader(object):
for line in f_obj:
fields = line.strip().split(self._delimiter)
if len(fields) != 2 or (self._only_src and len(fields) != 1):
if (not self._only_src and len(fields) != 2) or (self._only_src and
len(fields) != 1):
continue
sample_words = []
......@@ -275,7 +276,7 @@ class DataReader(object):
for sample_idx in self._sample_idxs:
if self._only_src:
yield (self._src_seq_ids[sample_idx])
yield (self._src_seq_ids[sample_idx], )
else:
yield (self._src_seq_ids[sample_idx],
self._trg_seq_ids[sample_idx][:-1],
......
......@@ -74,7 +74,7 @@ cd data/coco
python train.py --help
```
Data reading is defined in `reader.py`, and all images are resized to 300x300. During training, the data also goes through image augmentation and label augmentation: image augmentation covers random perturbation, expansion and flipping of the image itself, while label augmentation covers random cropping:
Data reading is defined in `reader.py`, and all images are resized to 300x300. During training, images are further augmented by random perturbation, expansion, flipping and cropping:
- Perturbation: randomly perturb the image's brightness, contrast, saturation and hue.
- Expansion: place the original image inside an expanded canvas filled with the pixel mean (which is subtracted again later in the mean-subtraction step), then crop, resize and flip the canvas.
- Flipping: horizontal flipping.
......

Running the example programs in this directory requires the latest develop version of PaddlePaddle. If your installed version of PaddlePaddle is older than this, please update your PaddlePaddle installation following the instructions in the installation guide.
Running the example programs in this directory requires the latest develop version of PaddlePaddle. If your installed version of PaddlePaddle is older than this, please update your PaddlePaddle installation following the instructions in the [installation guide](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html).
# Optical Character Recognition
## Code structure
```
├── ctc_reader.py # Downloads, reads and preprocesses data.
├── crnn_ctc_model.py # Defines the training, inference and evaluation networks.
├── ctc_train.py # Trains the model.
├── infer.py # Loads a trained model and runs inference on new data.
├── eval.py # Evaluates the model on a specified dataset.
└── utils.py # Common utility functions.
```
This section describes how to recognize text in images with the CRNN-CTC and CRNN-Attention models under PaddlePaddle Fluid.
## 1. CRNN-CTC
## Introduction
The task in this chapter is to recognize images containing a single line of Chinese characters. The image is first turned into a feature map by convolutions, the feature map is then converted into a sequence by the `im2sequence op`, and sequence features are learned with a bidirectional GRU. Training uses the CTC (Connectionist Temporal Classification) loss, and the final evaluation metric is the sample-level error rate.
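As a rough sketch of this pipeline (not the project's actual network, which lives in `crnn_ctc_model.py`; the filter count and GRU size below are illustrative placeholders), the forward pass might look as follows:

```
import paddle.fluid as fluid

def crnn_ctc_sketch(images, num_classes):
    # convolutions turn the image into a feature map
    conv = fluid.layers.conv2d(
        input=images, num_filters=32, filter_size=3, act="relu")
    # im2sequence slices the feature map column by column into a sequence
    seq = fluid.layers.im2sequence(
        input=conv, filter_size=[conv.shape[2], 1], stride=[1, 1])
    # a bidirectional GRU learns sequence features
    fc = fluid.layers.fc(input=seq, size=256 * 3)
    gru_fw = fluid.layers.dynamic_gru(input=fc, size=256)
    gru_bw = fluid.layers.dynamic_gru(input=fc, size=256, is_reverse=True)
    # per-step logits; one extra class is reserved for the CTC blank label
    return fluid.layers.fc(input=[gru_fw, gru_bw], size=num_classes + 1)
```

During training, such logits would be paired with `fluid.layers.warpctc` to compute the CTC loss.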
The files in this directory serve the following purposes:
- **ctc_reader.py :** Downloads, reads and preprocesses data. Provides the `train()` and `test()` methods, which produce data iterators for the training and test sets respectively.
- **crnn_ctc_model.py :** Defines the training, inference and evaluation networks.
- **ctc_train.py :** Trains the model; run `python ctc_train.py --help` for usage.
- **infer.py :** Loads a trained model and runs inference on new data; run `python infer.py --help` for usage.
- **eval.py :** Evaluates the model on a specified dataset; run `python eval.py --help` for usage.
- **utility.py :** Implements common utilities such as argument configuration and tensor construction.
### 1.1 Data
## Data
Data downloading and simple preprocessing are implemented in `ctc_reader.py`.
#### 1.1.1 Data format
### Data example
The training and test data we use are shown in Figure 1. Each image contains a single line of text of variable length; all of these images have been pre-cropped by a detection algorithm.
<p align="center">
<img src="images/demo.jpg" width="620" hspace='10'/> <br/>
......@@ -35,12 +34,12 @@
In the training set, the label of each image is the index of its characters in the dictionary. The label corresponding to Figure 1 is shown below:
```
3835,8371,7191,2369,6876,4162,1938,168,1517,4590,3793
80,84,68,82,83,72,78,77,68,67
```
In the label above, `3835` is the index of the character '两' and `4590` is the index of the Chinese comma character.
In the label above, `80` is the index of the character `Q` and `67` is the index of the English character `D`.
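For instance, such a label line can be mapped back to its characters with a few lines of Python (the dictionary path below is hypothetical; `ctc_reader.py` is the authoritative source for how the dictionary is loaded):

```
# Hypothetical sketch: assume a dictionary file with one character per line,
# where the line number is the character's index.
chars = [line.rstrip("\n") for line in open("data/dict.txt")]
label = "80,84,68,82,83,72,78,77,68,67"
print "".join(chars[int(i)] for i in label.split(","))
```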
#### 1.1.2 Data preparation
### Data preparation
**A. Training set**
......@@ -105,7 +104,9 @@ data/test_images/00003.jpg
Third mode: read an image path from stdin and run a single inference.
#### 1.2 Training
## Model training and inference
### Training
Train on a single GPU with the default data:
......@@ -121,7 +122,7 @@ env CUDA_VISIABLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True
Run `python ctc_train.py --help` for more usage information and detailed parameter descriptions.
Figure 2 shows the convergence curve obtained with the default parameters and dataset; the horizontal axis is the number of training iterations and the vertical axis the sample-level error rate, with the blue line for the training set and the red line for the test set. Over 45 passes of training, the lowest error rate on the test set was 21.11%, reached at pass 60.
Figure 2 shows the convergence curve obtained with the default parameters and dataset; the horizontal axis is the number of training iterations and the vertical axis the sample-level error rate, with the blue line for the training set and the red line for the test set. Over 60 passes of training, the lowest error rate on the test set was 22.0%, reached at pass 32.
<p align="center">
<img src="images/train.jpg" width="620" hspace='10'/> <br/>
......@@ -130,7 +131,7 @@ env CUDA_VISIABLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True
### 1.3 Evaluation
## Evaluation
Evaluate the model on a specified dataset by invoking the evaluation script as follows:
......@@ -144,7 +145,7 @@ env CUDA_VISIBLE_DEVICE=0 python eval.py \
Run `python eval.py --help` to see detailed parameter descriptions.
### 1.4 Inference
### Inference
Read an image path from standard input and run inference on it:
......@@ -176,5 +177,3 @@ env CUDA_VISIBLE_DEVICE=0 python infer.py \
--model_path="models/model_00044_15000" \
--input_images_list="data/test.list"
```
>注意:因为版权原因,我们暂时停止提供中文数据集的下载和使用服务,你通过`ctc_reader.py`自动下载的数据将是含有30W图片的英文数据集。在英文数据集上的训练结果会稍后发布。
import paddle.fluid as fluid
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import math
def conv_bn_pool(input,
......@@ -8,7 +11,8 @@ def conv_bn_pool(input,
param=None,
bias=None,
param_0=None,
is_test=False):
is_test=False,
pooling=True):
tmp = input
for i in xrange(group):
tmp = fluid.layers.conv2d(
......@@ -19,32 +23,25 @@ def conv_bn_pool(input,
param_attr=param if param_0 is None else param_0,
act=None, # LinearActivation
use_cudnn=True)
#tmp = fluid.layers.Print(tmp)
tmp = fluid.layers.batch_norm(
input=tmp,
act=act,
param_attr=param,
bias_attr=bias,
is_test=is_test)
tmp = fluid.layers.pool2d(
input=tmp,
pool_size=2,
pool_type='max',
pool_stride=2,
use_cudnn=True,
ceil_mode=True)
if pooling:
tmp = fluid.layers.pool2d(
input=tmp,
pool_size=2,
pool_type='max',
pool_stride=2,
use_cudnn=True,
ceil_mode=True)
return tmp
def ocr_convs(input,
num,
with_bn,
regularizer=None,
gradient_clip=None,
is_test=False):
assert (num % 4 == 0)
def ocr_convs(input, regularizer=None, gradient_clip=None, is_test=False):
b = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
......@@ -63,7 +60,8 @@ def ocr_convs(input,
tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test)
tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test)
tmp = conv_bn_pool(tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test)
tmp = conv_bn_pool(
tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test, pooling=False)
return tmp
......@@ -75,8 +73,6 @@ def encoder_net(images,
is_test=False):
conv_features = ocr_convs(
images,
8,
True,
regularizer=regularizer,
gradient_clip=gradient_clip,
is_test=is_test)
......@@ -143,6 +139,7 @@ def ctc_train_net(images, label, args, num_classes):
L2_RATE = 0.0004
LR = 1.0e-3
MOMENTUM = 0.9
learning_rate_decay = None
regularizer = fluid.regularizer.L2Decay(L2_RATE)
fc_out = encoder_net(images, num_classes, regularizer=regularizer)
......@@ -155,7 +152,15 @@ def ctc_train_net(images, label, args, num_classes):
error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label)
inference_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Momentum(learning_rate=LR, momentum=MOMENTUM)
if learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay([
args.total_step / 4, args.total_step / 2, args.total_step * 3 / 4
], [LR, LR * 0.1, LR * 0.01, LR * 0.001])
else:
learning_rate = LR
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate, momentum=MOMENTUM)
_, params_grads = optimizer.minimize(sum_cost)
model_average = None
if args.average_window > 0:
......
......@@ -7,7 +7,7 @@ from os import path
from paddle.v2.image import load_image
import paddle.v2 as paddle
NUM_CLASSES = 10784
NUM_CLASSES = 95
DATA_SHAPE = [1, 48, 512]
DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
......
......@@ -14,7 +14,7 @@ parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('pass_num', int, 100, "Number of training epochs.")
add_arg('total_step', int, 720000, "Number of training iterations.")
add_arg('log_period', int, 1000, "Log period.")
add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
......@@ -22,7 +22,7 @@ add_arg('save_model_dir', str, "./models", "The directory the model to be s
add_arg('init_model', str, None, "The init model file of directory.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('min_average_window',int, 10000, "Min average window.")
add_arg('max_average_window',int, 15625, "Max average window. It is suggested to set it to the number of minibatches in a pass.")
add_arg('max_average_window',int, 12500, "Max average window. It is suggested to set it to the number of minibatches in a pass.")
add_arg('average_window', float, 0.15, "Average window.")
add_arg('parallel', bool, False, "Whether use parallel training.")
# yapf: enable
......@@ -90,54 +90,57 @@ def train(args, data_reader=ctc_reader):
results = [result[0] for result in results]
return results
def test(pass_id, batch_id):
def test(iter_num):
error_evaluator.reset(exe)
for data in test_reader():
exe.run(inference_program, feed=get_feeder_data(data, place))
_, test_seq_error = error_evaluator.eval(exe)
print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
time.time(), pass_id, batch_id, str(test_seq_error[0]))
print "\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
time.time(), iter_num, str(test_seq_error[0]))
def save_model(args, exe, pass_id, batch_id):
filename = "model_%05d_%d" % (pass_id, batch_id)
def save_model(args, exe, iter_num):
filename = "model_%05d" % iter_num
fluid.io.save_params(
exe, dirname=args.save_model_dir, filename=filename)
print "Saved model to: %s/%s." % (args.save_model_dir, filename)
for pass_id in range(args.pass_num):
batch_id = 1
iter_num = 0
while True:
total_loss = 0.0
total_seq_error = 0.0
# train a pass
for data in train_reader():
iter_num += 1
if iter_num > args.total_step:
return
results = train_one_batch(data)
total_loss += results[0]
total_seq_error += results[2]
# training log
if batch_id % args.log_period == 0:
print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq err: %s" % (
time.time(), pass_id, batch_id,
total_loss / (batch_id * args.batch_size),
total_seq_error / (batch_id * args.batch_size))
if iter_num % args.log_period == 0:
print "\nTime: %s; Iter[%d]; Avg Warp-CTC loss: %.3f; Avg seq err: %.3f" % (
time.time(), iter_num,
total_loss / (args.log_period * args.batch_size),
total_seq_error / (args.log_period * args.batch_size))
sys.stdout.flush()
total_loss = 0.0
total_seq_error = 0.0
# evaluate
if batch_id % args.eval_period == 0:
if iter_num % args.eval_period == 0:
if model_average:
with model_average.apply(exe):
test(pass_id, batch_id)
test(iter_num)
else:
test(pass_id, batch_d)
test(iter_num)
# save model
if batch_id % args.save_model_period == 0:
if iter_num % args.save_model_period == 0:
if model_average:
with model_average.apply(exe):
save_model(args, exe, pass_id, batch_id)
save_model(args, exe, iter_num)
else:
save_model(args, exe, pass_id, batch_id)
batch_id += 1
save_model(args, exe, iter_num)
def main():
......
......@@ -35,7 +35,7 @@ def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
# prepare environment
place = fluid.CPUPlace()
if use_gpu:
if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
......