提交 b6ab2f4a 编写于 作者: G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix-transformer-batchsize-dev

...@@ -15,9 +15,7 @@ from multiprocessing import Manager, Process ...@@ -15,9 +15,7 @@ from multiprocessing import Manager, Process
import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.augmentor.trans_add_delta as trans_add_delta import data_utils.augmentor.trans_add_delta as trans_add_delta
from data_utils.util import suppress_complaints, suppress_signal from data_utils.util import suppress_complaints, suppress_signal
from data_utils.util import SharedNDArray, SharedMemoryPoolManager from data_utils.util import CriticalException, ForceExitWrapper
from data_utils.util import DaemonProcessGroup, batch_to_ndarray
from data_utils.util import CriticalException, ForceExitWrapper, EpochEndSignal
class SampleInfo(object): class SampleInfo(object):
...@@ -32,11 +30,12 @@ class SampleInfo(object): ...@@ -32,11 +30,12 @@ class SampleInfo(object):
label_bin_path (str): File containing the label data. label_bin_path (str): File containing the label data.
label_size (int): Byte count of the sample's label data. label_size (int): Byte count of the sample's label data.
label_frame_num (int): Label number of the sample. label_frame_num (int): Label number of the sample.
sample_name (str): Key of the sample
""" """
def __init__(self, feature_bin_path, feature_start, feature_size, def __init__(self, feature_bin_path, feature_start, feature_size,
feature_frame_num, feature_dim, label_bin_path, label_start, feature_frame_num, feature_dim, label_bin_path, label_start,
label_size, label_frame_num): label_size, label_frame_num, sample_name):
self.feature_bin_path = feature_bin_path self.feature_bin_path = feature_bin_path
self.feature_start = feature_start self.feature_start = feature_start
self.feature_size = feature_size self.feature_size = feature_size
...@@ -47,6 +46,7 @@ class SampleInfo(object): ...@@ -47,6 +46,7 @@ class SampleInfo(object):
self.label_start = label_start self.label_start = label_start
self.label_size = label_size self.label_size = label_size
self.label_frame_num = label_frame_num self.label_frame_num = label_frame_num
self.sample_name = sample_name
class SampleInfoBucket(object): class SampleInfoBucket(object):
...@@ -69,8 +69,8 @@ class SampleInfoBucket(object): ...@@ -69,8 +69,8 @@ class SampleInfoBucket(object):
split_sentence_threshold(int): Sentence whose length larger than split_sentence_threshold(int): Sentence whose length larger than
the value will trigger split operation. the value will trigger split operation.
split_sub_sentence_len(int): sub-sentence length is equal to split_sub_sentence_len(int): sub-sentence length is equal to
(split_sub_sentence_len + \ (split_sub_sentence_len
rand() % split_perturb). + rand() % split_perturb).
""" """
def __init__(self, def __init__(self,
...@@ -104,24 +104,33 @@ class SampleInfoBucket(object): ...@@ -104,24 +104,33 @@ class SampleInfoBucket(object):
feature_bin_path = self._feature_bin_paths[block_idx] feature_bin_path = self._feature_bin_paths[block_idx]
feature_desc_path = self._feature_desc_paths[block_idx] feature_desc_path = self._feature_desc_paths[block_idx]
label_desc_lines = open(label_desc_path).readlines()
feature_desc_lines = open(feature_desc_path).readlines() feature_desc_lines = open(feature_desc_path).readlines()
sample_num = int(label_desc_lines[0].split()[1]) label_desc_lines = []
assert sample_num == int(feature_desc_lines[0].split()[1]) if label_desc_path != "":
label_desc_lines = open(label_desc_path).readlines()
sample_num = int(feature_desc_lines[0].split()[1])
if label_desc_path != "":
assert sample_num == int(label_desc_lines[0].split()[1])
for i in xrange(sample_num): for i in xrange(sample_num):
feature_desc_split = feature_desc_lines[i + 1].split() feature_desc_split = feature_desc_lines[i + 1].split()
sample_name = feature_desc_split[0]
feature_start = int(feature_desc_split[2]) feature_start = int(feature_desc_split[2])
feature_size = int(feature_desc_split[3]) feature_size = int(feature_desc_split[3])
feature_frame_num = int(feature_desc_split[4]) feature_frame_num = int(feature_desc_split[4])
feature_dim = int(feature_desc_split[5]) feature_dim = int(feature_desc_split[5])
label_desc_split = label_desc_lines[i + 1].split() label_start = -1
label_start = int(label_desc_split[2]) label_size = -1
label_size = int(label_desc_split[3]) label_frame_num = feature_frame_num
label_frame_num = int(label_desc_split[4]) if label_desc_path != "":
assert feature_frame_num == label_frame_num label_desc_split = label_desc_lines[i + 1].split()
label_start = int(label_desc_split[2])
label_size = int(label_desc_split[3])
label_frame_num = int(label_desc_split[4])
assert feature_frame_num == label_frame_num
if self._split_sentence_threshold == -1 or \ if self._split_sentence_threshold == -1 or \
self._split_perturb == -1 or \ self._split_perturb == -1 or \
...@@ -131,7 +140,7 @@ class SampleInfoBucket(object): ...@@ -131,7 +140,7 @@ class SampleInfoBucket(object):
SampleInfo(feature_bin_path, feature_start, SampleInfo(feature_bin_path, feature_start,
feature_size, feature_frame_num, feature_dim, feature_size, feature_frame_num, feature_dim,
label_bin_path, label_start, label_size, label_bin_path, label_start, label_size,
label_frame_num)) label_frame_num, sample_name))
#split sentence #split sentence
else: else:
cur_frame_pos = 0 cur_frame_pos = 0
...@@ -152,16 +161,19 @@ class SampleInfoBucket(object): ...@@ -152,16 +161,19 @@ class SampleInfoBucket(object):
* feature_dim * 4, cur_frame_len * feature_dim * * feature_dim * 4, cur_frame_len * feature_dim *
4, cur_frame_len, feature_dim, label_bin_path, 4, cur_frame_len, feature_dim, label_bin_path,
label_start + cur_frame_pos * 4, cur_frame_len * label_start + cur_frame_pos * 4, cur_frame_len *
4, cur_frame_len)) 4, cur_frame_len, sample_name))
remain_frame_num -= cur_frame_len remain_frame_num -= cur_frame_len
cur_frame_pos += cur_frame_len cur_frame_pos += cur_frame_len
if remain_frame_num <= 0: if remain_frame_num <= 0:
break break
return sample_info_list return sample_info_list
class EpochEndSignal():
pass
class AsyncDataReader(object): class AsyncDataReader(object):
"""DataReader provides basic audio sample preprocessing pipeline including """DataReader provides basic audio sample preprocessing pipeline including
data loading and data augmentation. data loading and data augmentation.
...@@ -190,7 +202,7 @@ class AsyncDataReader(object): ...@@ -190,7 +202,7 @@ class AsyncDataReader(object):
def __init__(self, def __init__(self,
feature_file_list, feature_file_list,
label_file_list, label_file_list="",
drop_frame_len=512, drop_frame_len=512,
proc_num=10, proc_num=10,
sample_buffer_size=1024, sample_buffer_size=1024,
...@@ -213,27 +225,30 @@ class AsyncDataReader(object): ...@@ -213,27 +225,30 @@ class AsyncDataReader(object):
self._sample_info_buffer_size = sample_info_buffer_size self._sample_info_buffer_size = sample_info_buffer_size
self._batch_buffer_size = batch_buffer_size self._batch_buffer_size = batch_buffer_size
self._proc_num = proc_num self._proc_num = proc_num
if self._proc_num <= 2:
raise ValueError("Value of `proc_num` should be greater than 2.")
self._sample_proc_num = self._proc_num - 2
self._verbose = verbose self._verbose = verbose
self._force_exit = ForceExitWrapper(self._manager.Value('b', False)) self._force_exit = ForceExitWrapper(self._manager.Value('b', False))
self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size *
3, self._manager)
def generate_bucket_list(self, is_shuffle): def generate_bucket_list(self, is_shuffle):
if self._block_info_list is None: if self._block_info_list is None:
block_feature_info_lines = open(self._feature_file_list).readlines() block_feature_info_lines = open(self._feature_file_list).readlines()
block_label_info_lines = open(self._label_file_list).readlines()
assert len(block_feature_info_lines) == len(block_label_info_lines)
self._block_info_list = [] self._block_info_list = []
for i in xrange(0, len(block_feature_info_lines), 2): if self._label_file_list != "":
block_info = (block_feature_info_lines[i], block_label_info_lines = open(self._label_file_list).readlines()
block_feature_info_lines[i + 1], assert len(block_feature_info_lines) == len(
block_label_info_lines[i], block_label_info_lines)
block_label_info_lines[i + 1]) for i in xrange(0, len(block_feature_info_lines), 2):
self._block_info_list.append( block_info = (block_feature_info_lines[i],
map(lambda line: line.strip(), block_info)) block_feature_info_lines[i + 1],
block_label_info_lines[i],
block_label_info_lines[i + 1])
self._block_info_list.append(
map(lambda line: line.strip(), block_info))
else:
for i in xrange(0, len(block_feature_info_lines), 2):
block_info = (block_feature_info_lines[i],
block_feature_info_lines[i + 1], "", "")
self._block_info_list.append(
map(lambda line: line.strip(), block_info))
if is_shuffle: if is_shuffle:
self._rng.shuffle(self._block_info_list) self._rng.shuffle(self._block_info_list)
...@@ -253,23 +268,13 @@ class AsyncDataReader(object): ...@@ -253,23 +268,13 @@ class AsyncDataReader(object):
def set_transformers(self, transformers): def set_transformers(self, transformers):
self._transformers = transformers self._transformers = transformers
def recycle(self, *args): def _sample_generator(self):
for shared_ndarray in args:
if not isinstance(shared_ndarray, SharedNDArray):
raise Value("Only support recycle SharedNDArray object.")
shared_ndarray.recycle(self._pool_manager.pool)
def _start_async_processing(self):
sample_info_queue = self._manager.Queue(self._sample_info_buffer_size) sample_info_queue = self._manager.Queue(self._sample_info_buffer_size)
sample_queue = self._manager.Queue(self._sample_buffer_size) sample_queue = self._manager.Queue(self._sample_buffer_size)
self._order_id = 0 self._order_id = 0
@suppress_complaints(verbose=self._verbose, notify=self._force_exit) @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
def ordered_feeding_task(sample_info_queue): def ordered_feeding_task(sample_info_queue):
if self._verbose == 0:
signal.signal(signal.SIGTERM, suppress_signal)
signal.signal(signal.SIGINT, suppress_signal)
for sample_info_bucket in self._bucket_list: for sample_info_bucket in self._bucket_list:
try: try:
sample_info_list = \ sample_info_list = \
...@@ -282,12 +287,13 @@ class AsyncDataReader(object): ...@@ -282,12 +287,13 @@ class AsyncDataReader(object):
sample_info_queue.put((sample_info, self._order_id)) sample_info_queue.put((sample_info, self._order_id))
self._order_id += 1 self._order_id += 1
for i in xrange(self._sample_proc_num): for i in xrange(self._proc_num):
sample_info_queue.put(EpochEndSignal()) sample_info_queue.put(EpochEndSignal())
feeding_proc = DaemonProcessGroup( feeding_thread = Thread(
proc_num=1, target=ordered_feeding_task, args=(sample_info_queue, )) target=ordered_feeding_task, args=(sample_info_queue, ))
feeding_proc.start_all() feeding_thread.daemon = True
feeding_thread.start()
@suppress_complaints(verbose=self._verbose, notify=self._force_exit) @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
def ordered_processing_task(sample_info_queue, sample_queue, out_order): def ordered_processing_task(sample_info_queue, sample_queue, out_order):
...@@ -315,25 +321,32 @@ class AsyncDataReader(object): ...@@ -315,25 +321,32 @@ class AsyncDataReader(object):
sample_info.feature_size) sample_info.feature_size)
assert sample_info.feature_frame_num \ assert sample_info.feature_frame_num \
* sample_info.feature_dim * 4 == len(feature_bytes), \ * sample_info.feature_dim * 4 \
(sample_info.feature_bin_path, == len(feature_bytes), \
sample_info.feature_frame_num, (sample_info.feature_bin_path,
sample_info.feature_dim, sample_info.feature_frame_num,
len(feature_bytes)) sample_info.feature_dim,
len(feature_bytes))
label_bytes = read_bytes(sample_info.label_bin_path,
sample_info.label_start, label_data = None
sample_info.label_size) if sample_info.label_bin_path != "":
label_bytes = read_bytes(sample_info.label_bin_path,
assert sample_info.label_frame_num * 4 == len(label_bytes), ( sample_info.label_start,
sample_info.label_bin_path, sample_info.label_array, sample_info.label_size)
len(label_bytes))
assert sample_info.label_frame_num * 4 == len(
label_array = struct.unpack('I' * sample_info.label_frame_num, label_bytes), (sample_info.label_bin_path,
label_bytes) sample_info.label_array,
label_data = np.array( len(label_bytes))
label_array, dtype='int64').reshape(
(sample_info.label_frame_num, 1)) label_array = struct.unpack(
'I' * sample_info.label_frame_num, label_bytes)
label_data = np.array(
label_array, dtype='int64').reshape(
(sample_info.label_frame_num, 1))
else:
label_data = np.zeros(
(sample_info.label_frame_num, 1), dtype='int64')
feature_frame_num = sample_info.feature_frame_num feature_frame_num = sample_info.feature_frame_num
feature_dim = sample_info.feature_dim feature_dim = sample_info.feature_dim
...@@ -343,12 +356,11 @@ class AsyncDataReader(object): ...@@ -343,12 +356,11 @@ class AsyncDataReader(object):
feature_data = np.array( feature_data = np.array(
feature_array, dtype='float32').reshape(( feature_array, dtype='float32').reshape((
sample_info.feature_frame_num, sample_info.feature_dim)) sample_info.feature_frame_num, sample_info.feature_dim))
sample_data = (feature_data, label_data,
sample_data = (feature_data, label_data) sample_info.sample_name)
for transformer in self._transformers: for transformer in self._transformers:
# @TODO(pkuyym) to make transfomer only accept feature_data # @TODO(pkuyym) to make transfomer only accept feature_data
sample_data = transformer.perform_trans(sample_data) sample_data = transformer.perform_trans(sample_data)
while order_id != out_order[0]: while order_id != out_order[0]:
time.sleep(0.001) time.sleep(0.001)
...@@ -364,71 +376,77 @@ class AsyncDataReader(object): ...@@ -364,71 +376,77 @@ class AsyncDataReader(object):
out_order = self._manager.list([0]) out_order = self._manager.list([0])
args = (sample_info_queue, sample_queue, out_order) args = (sample_info_queue, sample_queue, out_order)
sample_proc = DaemonProcessGroup( workers = [
proc_num=self._sample_proc_num, Process(
target=ordered_processing_task, target=ordered_processing_task, args=args)
args=args) for _ in xrange(self._proc_num)
sample_proc.start_all() ]
return sample_queue for w in workers:
w.daemon = True
w.start()
def batch_iterator(self, batch_size, minimum_batch_size): finished_proc_num = 0
@suppress_complaints(verbose=self._verbose, notify=self._force_exit)
def batch_assembling_task(sample_queue, batch_queue, pool): while self._force_exit == False:
def conv_to_shared(ndarray): try:
while self._force_exit == False: sample = sample_queue.get_nowait()
try: except Queue.Empty:
(name, shared_ndarray) = pool.popitem() time.sleep(0.001)
except Exception as e: else:
time.sleep(0.001) if isinstance(sample, EpochEndSignal):
finished_proc_num += 1
if finished_proc_num >= self._proc_num:
break
else: else:
shared_ndarray.copy(ndarray) continue
return shared_ndarray
if self._verbose == 0: yield sample
signal.signal(signal.SIGTERM, suppress_signal)
signal.signal(signal.SIGINT, suppress_signal)
def batch_iterator(self, batch_size, minimum_batch_size):
def batch_to_ndarray(batch_samples, lod):
assert len(batch_samples)
frame_dim = batch_samples[0][0].shape[1]
batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
batch_label = np.zeros((lod[-1], 1), dtype="int64")
start = 0
name_lst = []
for sample in batch_samples:
frame_num = sample[0].shape[0]
batch_feature[start:start + frame_num, :] = sample[0]
batch_label[start:start + frame_num, :] = sample[1]
start += frame_num
name_lst.append(sample[2])
return (batch_feature, batch_label, name_lst)
@suppress_complaints(verbose=self._verbose, notify=self._force_exit)
def batch_assembling_task(sample_generator, batch_queue):
batch_samples = [] batch_samples = []
lod = [0] lod = [0]
done_num = 0 for sample in sample_generator():
while done_num < self._sample_proc_num: batch_samples.append(sample)
sample = sample_queue.get() lod.append(lod[-1] + sample[0].shape[0])
if isinstance(sample, EpochEndSignal): if len(batch_samples) == batch_size:
done_num += 1 (batch_feature, batch_label, name_lst) = batch_to_ndarray(
else: batch_samples, lod)
batch_samples.append(sample) batch_queue.put((batch_feature, batch_label, lod, name_lst))
lod.append(lod[-1] + sample[0].shape[0]) batch_samples = []
if len(batch_samples) == batch_size: lod = [0]
feature, label = batch_to_ndarray(batch_samples, lod)
feature = conv_to_shared(feature)
label = conv_to_shared(label)
lod = conv_to_shared(np.array(lod).astype('int64'))
batch_queue.put((feature, label, lod))
batch_samples = []
lod = [0]
if len(batch_samples) >= minimum_batch_size: if len(batch_samples) >= minimum_batch_size:
(feature, label) = batch_to_ndarray(batch_samples, lod) (batch_feature, batch_label, name_lst) = batch_to_ndarray(
batch_samples, lod)
feature = conv_to_shared(feature) batch_queue.put((batch_feature, batch_label, lod, name_lst))
label = conv_to_shared(label)
lod = conv_to_shared(np.array(lod).astype('int64'))
batch_queue.put((feature, label, lod))
batch_queue.put(EpochEndSignal()) batch_queue.put(EpochEndSignal())
sample_queue = self._start_async_processing() batch_queue = Queue.Queue(self._batch_buffer_size)
batch_queue = self._manager.Queue(self._batch_buffer_size)
assembling_proc = DaemonProcessGroup( assembling_thread = Thread(
proc_num=1,
target=batch_assembling_task, target=batch_assembling_task,
args=(sample_queue, batch_queue, self._pool_manager.pool)) args=(self._sample_generator, batch_queue))
assembling_proc.start_all() assembling_thread.daemon = True
assembling_thread.start()
while self._force_exit == False: while self._force_exit == False:
try: try:
......
...@@ -22,7 +22,7 @@ class TestTransMeanVarianceNorm(unittest.TestCase): ...@@ -22,7 +22,7 @@ class TestTransMeanVarianceNorm(unittest.TestCase):
feature = np.zeros((2, 120), dtype="float32") feature = np.zeros((2, 120), dtype="float32")
feature.fill(1) feature.fill(1)
trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path) trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path)
(feature1, label1) = trans.perform_trans((feature, None)) (feature1, label1, name) = trans.perform_trans((feature, None, None))
(mean, var) = trans.get_mean_var() (mean, var) = trans.get_mean_var()
feature_flat1 = feature1.flatten() feature_flat1 = feature1.flatten()
feature_flat = feature.flatten() feature_flat = feature.flatten()
...@@ -70,7 +70,7 @@ class TestTransAddDelta(unittest.TestCase): ...@@ -70,7 +70,7 @@ class TestTransAddDelta(unittest.TestCase):
feature[2, 0:40].fill(3) feature[2, 0:40].fill(3)
feature[3, 0:40].fill(4) feature[3, 0:40].fill(4)
trans = trans_add_delta.TransAddDelta() trans = trans_add_delta.TransAddDelta()
(feature, label) = trans.perform_trans((feature, None)) (feature, label, name) = trans.perform_trans((feature, None, None))
self.assertAlmostEqual(feature.shape[0], 4) self.assertAlmostEqual(feature.shape[0], 4)
self.assertAlmostEqual(feature.shape[1], 120) self.assertAlmostEqual(feature.shape[1], 120)
self.assertAlmostEqual(1.0, feature[0][0]) self.assertAlmostEqual(1.0, feature[0][0])
...@@ -93,7 +93,7 @@ class TestTransSplict(unittest.TestCase): ...@@ -93,7 +93,7 @@ class TestTransSplict(unittest.TestCase):
feature[i, :].fill(i) feature[i, :].fill(i)
trans = trans_splice.TransSplice() trans = trans_splice.TransSplice()
(feature, label) = trans.perform_trans((feature, None)) (feature, label, name) = trans.perform_trans((feature, None, None))
self.assertEqual(feature.shape[1], 110) self.assertEqual(feature.shape[1], 110)
for i in xrange(8): for i in xrange(8):
......
...@@ -32,9 +32,9 @@ class TransAddDelta(object): ...@@ -32,9 +32,9 @@ class TransAddDelta(object):
Args: Args:
sample(object,tuple): contain feature numpy and label numpy sample(object,tuple): contain feature numpy and label numpy
Returns: Returns:
(feature, label) (feature, label, name)
""" """
(feature, label) = sample (feature, label, name) = sample
frame_dim = feature.shape[1] frame_dim = feature.shape[1]
d_frame_dim = frame_dim * 3 d_frame_dim = frame_dim * 3
head_filled = 5 head_filled = 5
...@@ -64,7 +64,7 @@ class TransAddDelta(object): ...@@ -64,7 +64,7 @@ class TransAddDelta(object):
start * d_frame_dim + 2 * frame_dim, frame_dim, nframe, start * d_frame_dim + 2 * frame_dim, frame_dim, nframe,
d_frame_dim) d_frame_dim)
mat.shape = tmp_shape mat.shape = tmp_shape
return (mat[head_filled:mat.shape[0] - tail_filled, :], label) return (mat[head_filled:mat.shape[0] - tail_filled, :], label, name)
def _regress(self, data_in, start_in, data_out, start_out, size, n, step): def _regress(self, data_in, start_in, data_out, start_out, size, n, step):
""" regress """ regress
......
...@@ -53,9 +53,9 @@ class TransMeanVarianceNorm(object): ...@@ -53,9 +53,9 @@ class TransMeanVarianceNorm(object):
Args: Args:
sample(object):input sample, contain feature numpy and label numpy sample(object):input sample, contain feature numpy and label numpy
Returns: Returns:
(feature, label) (feature, label, name)
""" """
(feature, label) = sample (feature, label, name) = sample
shape = feature.shape shape = feature.shape
assert len(shape) == 2 assert len(shape) == 2
nfeature_len = shape[0] * shape[1] nfeature_len = shape[0] * shape[1]
...@@ -68,4 +68,4 @@ class TransMeanVarianceNorm(object): ...@@ -68,4 +68,4 @@ class TransMeanVarianceNorm(object):
feature[ncur_idx:ncur_idx + self._nLen] = block feature[ncur_idx:ncur_idx + self._nLen] = block
ncur_idx += self._nLen ncur_idx += self._nLen
feature = feature.reshape(shape) feature = feature.reshape(shape)
return (feature, label) return (feature, label, name)
...@@ -30,9 +30,9 @@ class TransSplice(object): ...@@ -30,9 +30,9 @@ class TransSplice(object):
Args: Args:
sample(object): input sample(feature, label) sample(object): input sample(feature, label)
Return: Return:
(feature, label) (feature, label, name)
""" """
(feature, label) = sample (feature, label, name) = sample
nframe_num = feature.shape[0] nframe_num = feature.shape[0]
nframe_dim = feature.shape[1] nframe_dim = feature.shape[1]
nnew_frame_dim = nframe_dim * ( nnew_frame_dim = nframe_dim * (
...@@ -61,4 +61,4 @@ class TransSplice(object): ...@@ -61,4 +61,4 @@ class TransSplice(object):
np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim], np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim],
mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim]) mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim])
ret = ret.reshape((nframe_num, nnew_frame_dim)) ret = ret.reshape((nframe_num, nnew_frame_dim))
return (ret, label) return (ret, label, name)
...@@ -4,8 +4,6 @@ from __future__ import print_function ...@@ -4,8 +4,6 @@ from __future__ import print_function
import sys import sys
from six import reraise from six import reraise
from tblib import Traceback from tblib import Traceback
from multiprocessing import Manager, Process
import posix_ipc, mmap
import numpy as np import numpy as np
...@@ -37,19 +35,6 @@ def lodtensor_to_ndarray(lod_tensor): ...@@ -37,19 +35,6 @@ def lodtensor_to_ndarray(lod_tensor):
return ret, lod_tensor.lod() return ret, lod_tensor.lod()
def batch_to_ndarray(batch_samples, lod):
frame_dim = batch_samples[0][0].shape[1]
batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
batch_label = np.zeros((lod[-1], 1), dtype="int64")
start = 0
for sample in batch_samples:
frame_num = sample[0].shape[0]
batch_feature[start:start + frame_num, :] = sample[0]
batch_label[start:start + frame_num, :] = sample[1]
start += frame_num
return (batch_feature, batch_label)
def split_infer_result(infer_seq, lod): def split_infer_result(infer_seq, lod):
infer_batch = [] infer_batch = []
for i in xrange(0, len(lod[0]) - 1): for i in xrange(0, len(lod[0]) - 1):
...@@ -57,126 +42,10 @@ def split_infer_result(infer_seq, lod): ...@@ -57,126 +42,10 @@ def split_infer_result(infer_seq, lod):
return infer_batch return infer_batch
class DaemonProcessGroup(object):
def __init__(self, proc_num, target, args):
self._proc_num = proc_num
self._workers = [
Process(
target=target, args=args) for _ in xrange(self._proc_num)
]
def start_all(self):
for w in self._workers:
w.daemon = True
w.start()
@property
def proc_num(self):
return self._proc_num
class EpochEndSignal(object):
pass
class CriticalException(Exception): class CriticalException(Exception):
pass pass
class SharedNDArray(object):
"""SharedNDArray utilizes shared memory to avoid data serialization when
data object shared among different processes. We can reconstruct the
`ndarray` when memory address, shape and dtype provided.
Args:
name (str): Address name of shared memory.
whether_verify (bool): Whether to validate the writing operation.
"""
def __init__(self, name, whether_verify=False):
self._name = name
self._shm = None
self._buf = None
self._array = np.zeros(1, dtype=np.float32)
self._inited = False
self._whether_verify = whether_verify
def zeros_like(self, shape, dtype):
size = int(np.prod(shape)) * np.dtype(dtype).itemsize
if self._inited:
self._shm = posix_ipc.SharedMemory(self._name)
else:
self._shm = posix_ipc.SharedMemory(
self._name, posix_ipc.O_CREAT, size=size)
self._buf = mmap.mmap(self._shm.fd, size)
self._array = np.ndarray(shape, dtype, self._buf, order='C')
def copy(self, ndarray):
size = int(np.prod(ndarray.shape)) * np.dtype(ndarray.dtype).itemsize
self.zeros_like(ndarray.shape, ndarray.dtype)
self._array[:] = ndarray
self._buf.flush()
self._inited = True
if self._whether_verify:
shm = posix_ipc.SharedMemory(self._name)
buf = mmap.mmap(shm.fd, size)
array = np.ndarray(ndarray.shape, ndarray.dtype, buf, order='C')
np.testing.assert_array_equal(array, ndarray)
@property
def ndarray(self):
return self._array
def recycle(self, pool):
self._buf.close()
self._shm.close_fd()
self._inited = False
pool[self._name] = self
def __getstate__(self):
return (self._name, self._array.shape, self._array.dtype, self._inited,
self._whether_verify)
def __setstate__(self, state):
self._name = state[0]
self._inited = state[3]
self.zeros_like(state[1], state[2])
self._whether_verify = state[4]
class SharedMemoryPoolManager(object):
"""SharedMemoryPoolManager maintains a multiprocessing.Manager.dict object.
All available addresses are allocated once and will be reused. Though this
class is not process-safe, the pool can be shared between processes. All
shared memory should be unlinked before the main process exited.
Args:
pool_size (int): Size of shared memory pool.
manager (dict): A multiprocessing.Manager object, the pool is
maintained by the proxy process.
name_prefix (str): Address prefix of shared memory.
"""
def __init__(self, pool_size, manager, name_prefix='/deep_asr'):
self._names = []
self._dict = manager.dict()
for i in xrange(pool_size):
name = name_prefix + '_' + str(i)
self._dict[name] = SharedNDArray(name)
self._names.append(name)
@property
def pool(self):
return self._dict
def __del__(self):
for name in self._names:
# have to unlink the shared memory
posix_ipc.unlink_shared_memory(name)
def suppress_signal(signo, stack_frame): def suppress_signal(signo, stack_frame):
pass pass
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "decoder.h"
std::string decode(std::vector<std::vector<float>> probs_mat) {
// Add decoding logic here
return "example decoding result";
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "post_decode_faster.h"
typedef kaldi::int32 int32;
using fst::SymbolTable;
using fst::VectorFst;
using fst::StdArc;
Decoder::Decoder(std::string word_syms_filename,
std::string fst_in_filename,
std::string logprior_rxfilename,
kaldi::BaseFloat acoustic_scale) {
const char* usage =
"Decode, reading log-likelihoods (of transition-ids or whatever symbol "
"is on the graph) as matrices.";
kaldi::ParseOptions po(usage);
binary = true;
this->acoustic_scale = acoustic_scale;
allow_partial = true;
kaldi::FasterDecoderOptions decoder_opts;
decoder_opts.Register(&po, true); // true == include obscure settings.
po.Register("binary", &binary, "Write output in binary mode");
po.Register("allow-partial",
&allow_partial,
"Produce output even when final state was not reached");
po.Register("acoustic-scale",
&acoustic_scale,
"Scaling factor for acoustic likelihoods");
word_syms = NULL;
if (word_syms_filename != "") {
word_syms = fst::SymbolTable::ReadText(word_syms_filename);
if (!word_syms)
KALDI_ERR << "Could not read symbol table from file "
<< word_syms_filename;
}
std::ifstream is_logprior(logprior_rxfilename);
logprior.Read(is_logprior, false);
// It's important that we initialize decode_fst after loglikes_reader, as it
// can prevent crashes on systems installed without enough virtual memory.
// It has to do with what happens on UNIX systems if you call fork() on a
// large process: the page-table entries are duplicated, which requires a
// lot of virtual memory.
decode_fst = fst::ReadFstKaldi(fst_in_filename);
decoder = new kaldi::FasterDecoder(*decode_fst, decoder_opts);
}
Decoder::~Decoder() {
if (!word_syms) delete word_syms;
delete decode_fst;
delete decoder;
}
std::string Decoder::decode(
std::string key,
const std::vector<std::vector<kaldi::BaseFloat>>& log_probs) {
size_t num_frames = log_probs.size();
size_t dim_label = log_probs[0].size();
kaldi::Matrix<kaldi::BaseFloat> loglikes(
num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols);
for (size_t i = 0; i < num_frames; ++i) {
memcpy(loglikes.Data() + i * dim_label,
log_probs[i].data(),
sizeof(kaldi::BaseFloat) * dim_label);
}
return decode(key, loglikes);
}
std::vector<std::string> Decoder::decode(std::string posterior_rspecifier) {
kaldi::SequentialBaseFloatMatrixReader posterior_reader(posterior_rspecifier);
std::vector<std::string> decoding_results;
for (; !posterior_reader.Done(); posterior_reader.Next()) {
std::string key = posterior_reader.Key();
kaldi::Matrix<kaldi::BaseFloat> loglikes(posterior_reader.Value());
decoding_results.push_back(decode(key, loglikes));
}
return decoding_results;
}
std::string Decoder::decode(std::string key,
kaldi::Matrix<kaldi::BaseFloat>& loglikes) {
std::string decoding_result;
if (loglikes.NumRows() == 0) {
KALDI_WARN << "Zero-length utterance: " << key;
}
KALDI_ASSERT(loglikes.NumCols() == logprior.Dim());
loglikes.ApplyLog();
loglikes.AddVecToRows(-1.0, logprior);
kaldi::DecodableMatrixScaled decodable(loglikes, acoustic_scale);
decoder->Decode(&decodable);
VectorFst<kaldi::LatticeArc> decoded; // linear FST.
if ((allow_partial || decoder->ReachedFinal()) &&
decoder->GetBestPath(&decoded)) {
if (!decoder->ReachedFinal())
KALDI_WARN << "Decoder did not reach end-state, outputting partial "
"traceback.";
std::vector<int32> alignment;
std::vector<int32> words;
kaldi::LatticeWeight weight;
GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
if (word_syms != NULL) {
for (size_t i = 0; i < words.size(); i++) {
std::string s = word_syms->Find(words[i]);
decoding_result += s;
if (s == "")
KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
}
}
}
return decoding_result;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -14,5 +14,45 @@ limitations under the License. */ ...@@ -14,5 +14,45 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "base/kaldi-common.h"
#include "base/timer.h"
#include "decoder/decodable-matrix.h"
#include "decoder/faster-decoder.h"
#include "fstext/fstext-lib.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h" // for {Compact}LatticeArc
#include "tree/context-dep.h"
#include "util/common-utils.h"
// NOTE(review): leftover declaration of the old free-function decoding API;
// appears unused now that decoding goes through the Decoder class — confirm
// before removing.
std::string decode(std::vector<std::vector<float>> probs_mat);

// Faster-decoder wrapper that turns acoustic posterior matrices into
// word-level transcriptions using a precompiled decoding graph.
class Decoder {
public:
  // word_syms_filename: symbol table mapping word-ids to word strings.
  // fst_in_filename: the decoding graph FST (e.g. TLG).
  // logprior_rxfilename: log priors of the training labels, subtracted
  //     from log posteriors to obtain scaled likelihoods.
  // acoustic_scale: scaling factor applied to acoustic likelihoods.
  Decoder(std::string word_syms_filename,
          std::string fst_in_filename,
          std::string logprior_rxfilename,
          kaldi::BaseFloat acoustic_scale);
  ~Decoder();

  // Interface to accept the scores read from specifier and return
  // the batch decoding results
  std::vector<std::string> decode(std::string posterior_rspecifier);

  // Accept the scores of one utterance and return the decoding result
  std::string decode(
      std::string key,
      const std::vector<std::vector<kaldi::BaseFloat>> &log_probs);

private:
  // For decoding one utterance
  std::string decode(std::string key,
                     kaldi::Matrix<kaldi::BaseFloat> &loglikes);

  fst::SymbolTable *word_syms;              // word-id -> word string table
  fst::VectorFst<fst::StdArc> *decode_fst;  // decoding graph
  kaldi::FasterDecoder *decoder;
  kaldi::Vector<kaldi::BaseFloat> logprior;  // log priors of labels
  bool binary;
  kaldi::BaseFloat acoustic_scale;
  bool allow_partial;  // emit a result even when no final state is reached
};
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -15,15 +15,25 @@ limitations under the License. */ ...@@ -15,15 +15,25 @@ limitations under the License. */
#include <pybind11/pybind11.h> #include <pybind11/pybind11.h>
#include <pybind11/stl.h> #include <pybind11/stl.h>
#include "decoder.h" #include "post_decode_faster.h"
namespace py = pybind11; namespace py = pybind11;
PYBIND11_MODULE(decoder, m) { PYBIND11_MODULE(post_decode_faster, m) {
m.doc() = "Decode function for Deep ASR model"; m.doc() = "Decoder for Deep ASR model";
m.def("decode", py::class_<Decoder>(m, "Decoder")
&decode, .def(py::init<std::string, std::string, std::string, kaldi::BaseFloat>())
"Decode one input probability matrix " .def("decode",
"and return the transcription"); (std::vector<std::string> (Decoder::*)(std::string)) &
Decoder::decode,
"Decode for the probability matrices in specifier "
"and return the transcriptions.")
.def(
"decode",
(std::string (Decoder::*)(
std::string, const std::vector<std::vector<kaldi::BaseFloat>>&)) &
Decoder::decode,
"Decode one input probability matrix "
"and return the transcription.");
} }
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -13,27 +13,57 @@ ...@@ -13,27 +13,57 @@
# limitations under the License. # limitations under the License.
import os import os
import glob
from distutils.core import setup, Extension from distutils.core import setup, Extension
from distutils.sysconfig import get_config_vars from distutils.sysconfig import get_config_vars
args = ['-std=c++11'] try:
kaldi_root = os.environ['KALDI_ROOT']
except:
raise ValueError("Enviroment variable 'KALDI_ROOT' is not defined. Please "
"install kaldi and export KALDI_ROOT=<kaldi's root dir> .")
args = [
'-std=c++11', '-Wno-sign-compare', '-Wno-unused-variable',
'-Wno-unused-local-typedefs', '-Wno-unused-but-set-variable',
'-Wno-deprecated-declarations', '-Wno-unused-function'
]
# remove warning about -Wstrict-prototypes # remove warning about -Wstrict-prototypes
(opt, ) = get_config_vars('OPT') (opt, ) = get_config_vars('OPT')
os.environ['OPT'] = " ".join(flag for flag in opt.split() os.environ['OPT'] = " ".join(flag for flag in opt.split()
if flag != '-Wstrict-prototypes') if flag != '-Wstrict-prototypes')
os.environ['CC'] = 'g++'
LIBS = [
'fst', 'kaldi-base', 'kaldi-util', 'kaldi-matrix', 'kaldi-tree',
'kaldi-hmm', 'kaldi-fstext', 'kaldi-decoder', 'kaldi-lat'
]
LIB_DIRS = [
'tools/openfst/lib', 'src/base', 'src/matrix', 'src/util', 'src/tree',
'src/hmm', 'src/fstext', 'src/decoder', 'src/lat'
]
LIB_DIRS = [os.path.join(kaldi_root, path) for path in LIB_DIRS]
LIB_DIRS = [os.path.abspath(path) for path in LIB_DIRS]
ext_modules = [ ext_modules = [
Extension( Extension(
'decoder', 'post_decode_faster',
['pybind.cc', 'decoder.cc'], ['pybind.cc', 'post_decode_faster.cc'],
include_dirs=['pybind11/include', '.'], include_dirs=[
'pybind11/include', '.', os.path.join(kaldi_root, 'src'),
os.path.join(kaldi_root, 'tools/openfst/src/include')
],
language='c++', language='c++',
libraries=LIBS,
library_dirs=LIB_DIRS,
runtime_library_dirs=LIB_DIRS,
extra_compile_args=args, ), extra_compile_args=args, ),
] ]
setup( setup(
name='decoder', name='post_decode_faster',
version='0.0.1', version='0.0.1',
author='Paddle', author='Paddle',
author_email='', author_email='',
......
set -e
if [ ! -d pybind11 ]; then if [ ! -d pybind11 ]; then
git clone https://github.com/pybind/pybind11.git git clone https://github.com/pybind/pybind11.git
......
...@@ -8,7 +8,7 @@ import paddle.fluid as fluid ...@@ -8,7 +8,7 @@ import paddle.fluid as fluid
import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.augmentor.trans_add_delta as trans_add_delta import data_utils.augmentor.trans_add_delta as trans_add_delta
import data_utils.augmentor.trans_splice as trans_splice import data_utils.augmentor.trans_splice as trans_splice
import data_utils.data_reader as reader import data_utils.async_data_reader as reader
from data_utils.util import lodtensor_to_ndarray from data_utils.util import lodtensor_to_ndarray
from data_utils.util import split_infer_result from data_utils.util import split_infer_result
...@@ -79,12 +79,13 @@ def infer(args): ...@@ -79,12 +79,13 @@ def infer(args):
trans_splice.TransSplice() trans_splice.TransSplice()
] ]
infer_data_reader = reader.DataReader(args.infer_feature_lst, infer_data_reader = reader.AsyncDataReader(args.infer_feature_lst,
args.infer_label_lst) args.infer_label_lst)
infer_data_reader.set_transformers(ltrans) infer_data_reader.set_transformers(ltrans)
feature_t = fluid.LoDTensor() feature_t = fluid.LoDTensor()
one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next() one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next()
(features, labels, lod) = one_batch (features, labels, lod) = one_batch
feature_t.set(features, place) feature_t.set(features, place)
feature_t.set_lod([lod]) feature_t.set_lod([lod])
......
...@@ -13,7 +13,7 @@ import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm ...@@ -13,7 +13,7 @@ import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.augmentor.trans_add_delta as trans_add_delta import data_utils.augmentor.trans_add_delta as trans_add_delta
import data_utils.augmentor.trans_splice as trans_splice import data_utils.augmentor.trans_splice as trans_splice
import data_utils.async_data_reader as reader import data_utils.async_data_reader as reader
import decoder.decoder as decoder from decoder.post_decode_faster import Decoder
from data_utils.util import lodtensor_to_ndarray from data_utils.util import lodtensor_to_ndarray
from model_utils.model import stacked_lstmp_model from model_utils.model import stacked_lstmp_model
from data_utils.util import split_infer_result from data_utils.util import split_infer_result
...@@ -32,6 +32,11 @@ def parse_args(): ...@@ -32,6 +32,11 @@ def parse_args():
default=1, default=1,
help='The minimum sequence number of a batch data. ' help='The minimum sequence number of a batch data. '
'(default: %(default)d)') '(default: %(default)d)')
parser.add_argument(
'--frame_dim',
type=int,
default=120 * 11,
help='Frame dimension of feature data. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--stacked_num', '--stacked_num',
type=int, type=int,
...@@ -47,6 +52,11 @@ def parse_args(): ...@@ -47,6 +52,11 @@ def parse_args():
type=int, type=int,
default=1024, default=1024,
help='Hidden size of lstmp unit. (default: %(default)d)') help='Hidden size of lstmp unit. (default: %(default)d)')
parser.add_argument(
'--class_num',
type=int,
default=1749,
help='Number of classes in label. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--learning_rate', '--learning_rate',
type=float, type=float,
...@@ -81,6 +91,26 @@ def parse_args(): ...@@ -81,6 +91,26 @@ def parse_args():
type=str, type=str,
default='./checkpoint', default='./checkpoint',
help="The checkpoint path to init model. (default: %(default)s)") help="The checkpoint path to init model. (default: %(default)s)")
parser.add_argument(
'--vocabulary',
type=str,
default='./decoder/graph/words.txt',
help="The path to vocabulary. (default: %(default)s)")
parser.add_argument(
'--graphs',
type=str,
default='./decoder/graph/TLG.fst',
help="The path to TLG graphs for decoding. (default: %(default)s)")
parser.add_argument(
'--log_prior',
type=str,
default="./decoder/logprior",
help="The log prior probs for training data. (default: %(default)s)")
parser.add_argument(
'--acoustic_scale',
type=float,
default=0.2,
help="Scaling factor for acoustic likelihoods. (default: %(default)f)")
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -99,10 +129,11 @@ def infer_from_ckpt(args): ...@@ -99,10 +129,11 @@ def infer_from_ckpt(args):
raise IOError("Invalid checkpoint!") raise IOError("Invalid checkpoint!")
prediction, avg_cost, accuracy = stacked_lstmp_model( prediction, avg_cost, accuracy = stacked_lstmp_model(
frame_dim=args.frame_dim,
hidden_dim=args.hidden_dim, hidden_dim=args.hidden_dim,
proj_dim=args.proj_dim, proj_dim=args.proj_dim,
stacked_num=args.stacked_num, stacked_num=args.stacked_num,
class_num=1749, class_num=args.class_num,
parallel=args.parallel) parallel=args.parallel)
infer_program = fluid.default_main_program().clone() infer_program = fluid.default_main_program().clone()
...@@ -117,6 +148,10 @@ def infer_from_ckpt(args): ...@@ -117,6 +148,10 @@ def infer_from_ckpt(args):
# load checkpoint. # load checkpoint.
fluid.io.load_persistables(exe, args.checkpoint) fluid.io.load_persistables(exe, args.checkpoint)
# init decoder
decoder = Decoder(args.vocabulary, args.graphs, args.log_prior,
args.acoustic_scale)
ltrans = [ ltrans = [
trans_add_delta.TransAddDelta(2, 2), trans_add_delta.TransAddDelta(2, 2),
trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var), trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
...@@ -136,12 +171,10 @@ def infer_from_ckpt(args): ...@@ -136,12 +171,10 @@ def infer_from_ckpt(args):
args.minimum_batch_size)): args.minimum_batch_size)):
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod) = batch_data
feature_t.set(features.ndarray, place) feature_t.set(features, place)
feature_t.set_lod([lod.ndarray]) feature_t.set_lod([lod])
label_t.set(labels.ndarray, place) label_t.set(labels, place)
label_t.set_lod([lod.ndarray]) label_t.set_lod([lod])
infer_data_reader.recycle(features, labels, lod)
results = exe.run(infer_program, results = exe.run(infer_program,
feed={"feature": feature_t, feed={"feature": feature_t,
...@@ -154,8 +187,8 @@ def infer_from_ckpt(args): ...@@ -154,8 +187,8 @@ def infer_from_ckpt(args):
probs, lod = lodtensor_to_ndarray(results[0]) probs, lod = lodtensor_to_ndarray(results[0])
infer_batch = split_infer_result(probs, lod) infer_batch = split_infer_result(probs, lod)
for index, sample in enumerate(infer_batch): for index, sample in enumerate(infer_batch):
print("Decoding %d: " % (batch_id * args.batch_size + index), key = "utter#%d" % (batch_id * args.batch_size + index)
decoder.decode(sample)) print(key, ": ", decoder.decode(key, sample).encode("utf8"), "\n")
print(np.mean(infer_costs), np.mean(infer_accs)) print(np.mean(infer_costs), np.mean(infer_accs))
......
...@@ -6,7 +6,8 @@ import paddle.v2 as paddle ...@@ -6,7 +6,8 @@ import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
def stacked_lstmp_model(hidden_dim, def stacked_lstmp_model(frame_dim,
hidden_dim,
proj_dim, proj_dim,
stacked_num, stacked_num,
class_num, class_num,
...@@ -20,12 +21,13 @@ def stacked_lstmp_model(hidden_dim, ...@@ -20,12 +21,13 @@ def stacked_lstmp_model(hidden_dim,
label data respectively. And in inference, only `feature` is needed. label data respectively. And in inference, only `feature` is needed.
Args: Args:
hidden_dim(int): The hidden state's dimension of the LSTMP layer. frame_dim(int): The frame dimension of feature data.
proj_dim(int): The projection size of the LSTMP layer. hidden_dim(int): The hidden state's dimension of the LSTMP layer.
stacked_num(int): The number of stacked LSTMP layers. proj_dim(int): The projection size of the LSTMP layer.
parallel(bool): Run in parallel or not, default `False`. stacked_num(int): The number of stacked LSTMP layers.
is_train(bool): Run in training phase or not, default `True`. parallel(bool): Run in parallel or not, default `False`.
class_dim(int): The number of output classes. is_train(bool): Run in training phase or not, default `True`.
class_dim(int): The number of output classes.
""" """
# network configuration # network configuration
...@@ -78,7 +80,7 @@ def stacked_lstmp_model(hidden_dim, ...@@ -78,7 +80,7 @@ def stacked_lstmp_model(hidden_dim,
# data feeder # data feeder
feature = fluid.layers.data( feature = fluid.layers.data(
name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1) name="feature", shape=[-1, frame_dim], dtype="float32", lod_level=1)
label = fluid.layers.data( label = fluid.layers.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=1) name="label", shape=[-1, 1], dtype="int64", lod_level=1)
...@@ -92,11 +94,12 @@ def stacked_lstmp_model(hidden_dim, ...@@ -92,11 +94,12 @@ def stacked_lstmp_model(hidden_dim,
feat_ = pd.read_input(feature) feat_ = pd.read_input(feature)
label_ = pd.read_input(label) label_ = pd.read_input(label)
prediction, avg_cost, acc = _net_conf(feat_, label_) prediction, avg_cost, acc = _net_conf(feat_, label_)
for out in [avg_cost, acc]: for out in [prediction, avg_cost, acc]:
pd.write_output(out) pd.write_output(out)
# get mean loss and acc through every devices. # get mean loss and acc through every devices.
avg_cost, acc = pd() prediction, avg_cost, acc = pd()
prediction.stop_gradient = True
avg_cost = fluid.layers.mean(x=avg_cost) avg_cost = fluid.layers.mean(x=avg_cost)
acc = fluid.layers.mean(x=acc) acc = fluid.layers.mean(x=acc)
else: else:
......
...@@ -31,6 +31,11 @@ def parse_args(): ...@@ -31,6 +31,11 @@ def parse_args():
default=1, default=1,
help='The minimum sequence number of a batch data. ' help='The minimum sequence number of a batch data. '
'(default: %(default)d)') '(default: %(default)d)')
parser.add_argument(
'--frame_dim',
type=int,
default=120 * 11,
help='Frame dimension of feature data. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--stacked_num', '--stacked_num',
type=int, type=int,
...@@ -46,6 +51,11 @@ def parse_args(): ...@@ -46,6 +51,11 @@ def parse_args():
type=int, type=int,
default=1024, default=1024,
help='Hidden size of lstmp unit. (default: %(default)d)') help='Hidden size of lstmp unit. (default: %(default)d)')
parser.add_argument(
'--class_num',
type=int,
default=1749,
help='Number of classes in label. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--learning_rate', '--learning_rate',
type=float, type=float,
...@@ -119,10 +129,11 @@ def profile(args): ...@@ -119,10 +129,11 @@ def profile(args):
"arg 'first_batches_to_skip' must not be smaller than 0.") "arg 'first_batches_to_skip' must not be smaller than 0.")
_, avg_cost, accuracy = stacked_lstmp_model( _, avg_cost, accuracy = stacked_lstmp_model(
frame_dim=args.frame_dim,
hidden_dim=args.hidden_dim, hidden_dim=args.hidden_dim,
proj_dim=args.proj_dim, proj_dim=args.proj_dim,
stacked_num=args.stacked_num, stacked_num=args.stacked_num,
class_num=1749, class_num=args.class_num,
parallel=args.parallel) parallel=args.parallel)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
...@@ -158,14 +169,12 @@ def profile(args): ...@@ -158,14 +169,12 @@ def profile(args):
frames_seen = 0 frames_seen = 0
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod) = batch_data
feature_t.set(features.ndarray, place) feature_t.set(features, place)
feature_t.set_lod([lod.ndarray]) feature_t.set_lod([lod])
label_t.set(labels.ndarray, place) label_t.set(labels, place)
label_t.set_lod([lod.ndarray]) label_t.set_lod([lod])
frames_seen += lod.ndarray[-1]
data_reader.recycle(features, labels, lod) frames_seen += lod[-1]
outs = exe.run(fluid.default_main_program(), outs = exe.run(fluid.default_main_program(),
feed={"feature": feature_t, feed={"feature": feature_t,
......
...@@ -30,6 +30,11 @@ def parse_args(): ...@@ -30,6 +30,11 @@ def parse_args():
default=1, default=1,
help='The minimum sequence number of a batch data. ' help='The minimum sequence number of a batch data. '
'(default: %(default)d)') '(default: %(default)d)')
parser.add_argument(
'--frame_dim',
type=int,
default=120 * 11,
help='Frame dimension of feature data. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--stacked_num', '--stacked_num',
type=int, type=int,
...@@ -45,6 +50,11 @@ def parse_args(): ...@@ -45,6 +50,11 @@ def parse_args():
type=int, type=int,
default=1024, default=1024,
help='Hidden size of lstmp unit. (default: %(default)d)') help='Hidden size of lstmp unit. (default: %(default)d)')
parser.add_argument(
'--class_num',
type=int,
default=1749,
help='Number of classes in label. (default: %(default)d)')
parser.add_argument( parser.add_argument(
'--pass_num', '--pass_num',
type=int, type=int,
...@@ -137,10 +147,11 @@ def train(args): ...@@ -137,10 +147,11 @@ def train(args):
os.mkdir(args.infer_models) os.mkdir(args.infer_models)
prediction, avg_cost, accuracy = stacked_lstmp_model( prediction, avg_cost, accuracy = stacked_lstmp_model(
frame_dim=args.frame_dim,
hidden_dim=args.hidden_dim, hidden_dim=args.hidden_dim,
proj_dim=args.proj_dim, proj_dim=args.proj_dim,
stacked_num=args.stacked_num, stacked_num=args.stacked_num,
class_num=1749, class_num=args.class_num,
parallel=args.parallel) parallel=args.parallel)
# program for test # program for test
...@@ -182,12 +193,10 @@ def train(args): ...@@ -182,12 +193,10 @@ def train(args):
args.minimum_batch_size)): args.minimum_batch_size)):
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod) = batch_data
feature_t.set(features.ndarray, place) feature_t.set(features, place)
feature_t.set_lod([lod.ndarray]) feature_t.set_lod([lod])
label_t.set(labels.ndarray, place) label_t.set(labels, place)
label_t.set_lod([lod.ndarray]) label_t.set_lod([lod])
test_data_reader.recycle(features, labels, lod)
cost, acc = exe.run(test_program, cost, acc = exe.run(test_program,
feed={"feature": feature_t, feed={"feature": feature_t,
...@@ -201,6 +210,7 @@ def train(args): ...@@ -201,6 +210,7 @@ def train(args):
# train data reader # train data reader
train_data_reader = reader.AsyncDataReader(args.train_feature_lst, train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
args.train_label_lst, -1) args.train_label_lst, -1)
train_data_reader.set_transformers(ltrans) train_data_reader.set_transformers(ltrans)
# train # train
for pass_id in xrange(args.pass_num): for pass_id in xrange(args.pass_num):
...@@ -209,13 +219,11 @@ def train(args): ...@@ -209,13 +219,11 @@ def train(args):
train_data_reader.batch_iterator(args.batch_size, train_data_reader.batch_iterator(args.batch_size,
args.minimum_batch_size)): args.minimum_batch_size)):
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod, name_lst) = batch_data
feature_t.set(features.ndarray, place) feature_t.set(features, place)
feature_t.set_lod([lod.ndarray]) feature_t.set_lod([lod])
label_t.set(labels.ndarray, place) label_t.set(labels, place)
label_t.set_lod([lod.ndarray]) label_t.set_lod([lod])
train_data_reader.recycle(features, labels, lod)
to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0) to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
outs = exe.run(fluid.default_main_program(), outs = exe.run(fluid.default_main_program(),
......
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
This tool is used to convert a Caffe model to Fluid model This tool is used to convert a Caffe model to Fluid model
### Howto ### Howto
1, Prepare caffepb.py in ./proto, two options provided 1, Prepare caffepb.py in ./proto if your python has no 'pycaffe' module, two options provided here:
1) generate it from caffe.proto using protoc 1) generate it from caffe.proto using protoc
bash ./proto/compile.sh bash ./proto/compile.sh
...@@ -12,14 +13,24 @@ This tool is used to convert a Caffe model to Fluid model ...@@ -12,14 +13,24 @@ This tool is used to convert a Caffe model to Fluid model
2, Convert the caffe model using 'convert.py' which will generate a python script and a weight(in .npy) file 2, Convert the caffe model using 'convert.py' which will generate a python script and a weight(in .npy) file
3, Use the converted model to predict 3, Use the converted model to predict
see more detail info in 'tests/lenet/README.md'
see more detail info in 'examples/xxx'
### Tested models
- Lenet on mnist dataset

- ResNets:(ResNet-50, ResNet-101, ResNet-152)
  model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_
- GoogleNet:
model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_
- VGG:
model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_
- AlexNet:
model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_
### Notes ### Notes
Some of this code come from here: https://github.com/ethereon/caffe-tensorflow Some of this code come from here: https://github.com/ethereon/caffe-tensorflow
...@@ -4,8 +4,8 @@ import os ...@@ -4,8 +4,8 @@ import os
import sys import sys
import numpy as np import numpy as np
import argparse import argparse
from kaffe import KaffeError, print_stderr
from kaffe import KaffeError, print_stderr
from kaffe.paddle import Transformer from kaffe.paddle import Transformer
...@@ -47,6 +47,8 @@ def convert(def_path, caffemodel_path, data_output_path, code_output_path, ...@@ -47,6 +47,8 @@ def convert(def_path, caffemodel_path, data_output_path, code_output_path,
except KaffeError as err: except KaffeError as err:
fatal_error('Error encountered: {}'.format(err)) fatal_error('Error encountered: {}'.format(err))
return 0
def main(): def main():
""" main """ main
...@@ -64,9 +66,10 @@ def main(): ...@@ -64,9 +66,10 @@ def main():
help='The phase to convert: test (default) or train') help='The phase to convert: test (default) or train')
args = parser.parse_args() args = parser.parse_args()
validate_arguments(args) validate_arguments(args)
convert(args.def_path, args.caffemodel, args.data_output_path, return convert(args.def_path, args.caffemodel, args.data_output_path,
args.code_output_path, args.phase) args.code_output_path, args.phase)
if __name__ == '__main__': if __name__ == '__main__':
main() ret = main()
sys.exit(ret)
a demo to show converting caffe models on 'imagenet' using caffe2fluid
---
# How to use
1. prepare python environment
2. download caffe model to "models.caffe/xxx" which contains "xxx.caffemodel" and "xxx.prototxt"
3. run the tool
eg: bash ./run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
#!/bin/env python
#function:
# a demo to show how to use the converted model genereated by caffe2fluid
#
#notes:
# only support imagenet data
import os
import sys
import inspect
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
def load_data(imgfile, shape):
    """Load an image and preprocess it into a mean-subtracted, BGR, CHW
    batch of one, matching the network's expected input layout.

    `shape` is the [C, H, W] input shape; the returned array has shape
    [1] + shape and dtype float32.
    """
    height, width = shape[1:]
    from PIL import Image
    img = Image.open(imgfile)

    # PIL loads W(width), H(height), C(channel); PaddlePaddle requires
    # the CHW order, so resize first and then transpose.
    img = img.resize((width, height), Image.ANTIALIAS)
    arr = np.array(img).astype(np.float32)
    arr = arr.transpose((2, 0, 1))  # HWC -> CHW
    arr = arr[(2, 1, 0), :, :]      # RGB -> BGR

    # Subtract the per-channel ImageNet mean from each pixel.
    channel_mean = np.array(
        [104., 117., 124.], dtype=np.float32).reshape([3, 1, 1])
    arr = arr - channel_mean

    return arr.reshape([1] + shape)
def build_model(net_file, net_name):
    """Import class `net_name` from the generated script `net_file` and
    build the inference network.

    Returns:
        (net, input_shape) on success, or None if the module cannot be
        loaded (matching the original error behavior).
    """
    print('build model with net_file[%s] and net_name[%s]' %
          (net_file, net_name))

    net_path = os.path.dirname(net_file)
    # Fix: the original used .rstrip('.py'), which strips a *character
    # set*, not a suffix — e.g. 'happy.py' became 'ha'. splitext removes
    # exactly the extension.
    module_name = os.path.splitext(os.path.basename(net_file))[0]
    if net_path not in sys.path:
        sys.path.insert(0, net_path)

    try:
        m = __import__(module_name, fromlist=[net_name])
        MyNet = getattr(m, net_name)
    except Exception as e:
        print('failed to load module[%s]' % (module_name))
        print(e)
        return None

    input_name = 'data'
    input_shape = MyNet.input_shapes()[input_name]
    images = fluid.layers.data(name='image', shape=input_shape, dtype='float32')
    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    net = MyNet({input_name: images})
    # (removed a duplicated recomputation of input_shape here)
    return net, input_shape
def dump_results(results, names, root):
    """Save each array in `results` to <root>/<name>.npy, creating `root`
    (including missing parents) if necessary.

    `results` and `names` are parallel sequences; extra entries in either
    are ignored.
    """
    # Fix: the original called os.path.mkdir(), which does not exist and
    # raised AttributeError whenever `root` was missing. os.makedirs also
    # creates intermediate directories.
    if not os.path.exists(root):
        os.makedirs(root)

    for name, res in zip(names, results):
        np.save(os.path.join(root, name + '.npy'), res)
def infer(net_file, net_name, model_file, imgfile, debug=False):
    """Run single-image inference with a converted model.

    Args:
        net_file (str): path to the generated 'xxx.py' network script.
        net_name (str): class name of the network inside `net_file`.
        model_file (str): weight file — a '.npy' weights dump or another
            path handed straight to net.load (presumably a fluid
            checkpoint dir — confirm against the converter's output).
        imgfile (str): path to the input image.
        debug (bool): if True, fetch and dump every layer's output
            instead of printing only the predicted class.
    """
    #1, build model
    net, input_shape = build_model(net_file, net_name)
    prediction = net.get_output()

    #2, load weights for this model
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    startup_program = fluid.default_startup_program()
    exe.run(startup_program)

    # '.npy' weight files need the place to set tensors directly; other
    # paths are passed through without it.
    if model_file.find('.npy') > 0:
        net.load(data_path=model_file, exe=exe, place=place)
    else:
        net.load(data_path=model_file, exe=exe)

    #3, test this model
    test_program = fluid.default_main_program().clone()

    fetch_list_var = []
    fetch_list_name = []
    if debug is False:
        fetch_list_var.append(prediction)
    else:
        # Debug mode fetches every layer so outputs can be compared
        # against the original Caffe model layer by layer.
        for k, v in net.layers.items():
            fetch_list_var.append(v)
            fetch_list_name.append(k)

    np_images = load_data(imgfile, input_shape)
    results = exe.run(program=test_program,
                      feed={'image': np_images},
                      fetch_list=fetch_list_var)

    if debug is True:
        dump_path = 'results.layers'
        dump_results(results, fetch_list_name, dump_path)
        print('all results dumped to [%s]' % (dump_path))
    else:
        result = results[0]
        print('predicted class:', np.argmax(result))
if __name__ == "__main__":
""" maybe more convenient to use 'run.sh' to call this tool
"""
net_file = 'models/resnet50/resnet50.py'
weight_file = 'models/resnet50/resnet50.npy'
imgfile = 'data/65.jpeg'
net_name = 'ResNet50'
argc = len(sys.argv)
if argc == 5:
net_file = sys.argv[1]
weight_file = sys.argv[2]
imgfile = sys.argv[3]
net_name = sys.argv[4]
elif argc > 1:
print('usage:')
print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' %
(sys.argv[0]))
print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
weight_file, imgfile, net_name))
sys.exit(1)
infer(net_file, net_name, weight_file, imgfile)
#!/bin/bash
#function:
#   a tool used to:
#   1, convert a caffe model
#   2, do inference using this model
#
#usage:
#   bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
#

#set -x

# Require model name, caffe model dir and output dir; the optional 4th
# argument skips the inference step after conversion.
if [[ $# -lt 3 ]];then
    echo "usage:"
    echo " bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
    echo " eg: bash $0 resnet50 ./models.caffe/resnet50 ./models/resnet50"
    exit 1
else
    model_name=$1
    cf_model_path=$2
    pd_model_path=$3
    only_convert=$4
fi

# Expected input/output file layout, derived from the model name.
proto_file=$cf_model_path/${model_name}.prototxt
caffemodel_file=$cf_model_path/${model_name}.caffemodel
weight_file=$pd_model_path/${model_name}.npy
net_file=$pd_model_path/${model_name}.py

if [[ ! -e $proto_file ]];then
    echo "not found prototxt[$proto_file]"
    exit 1
fi

if [[ ! -e $caffemodel_file ]];then
    echo "not found caffemodel[$caffemodel_file]"
    exit 1
fi

if [[ ! -e $pd_model_path ]];then
    mkdir $pd_model_path
fi

# Prefer a dedicated python for the caffe side ('cfpython') if installed.
PYTHON=`which cfpython`
if [[ -z $PYTHON ]];then
    PYTHON=`which python`
fi
$PYTHON ../../convert.py \
        $proto_file \
        --caffemodel $caffemodel_file \
        --data-output-path $weight_file\
        --code-output-path $net_file
ret=$?
if [[ $ret -ne 0 ]];then
    echo "failed to convert caffe model[$cf_model_path]"
    exit $ret
else
    echo "succeed to convert caffe model[$cf_model_path] to fluid model[$pd_model_path]"
fi

if [[ -z $only_convert ]];then
    # Prefer a dedicated python with paddle installed ('pdpython').
    PYTHON=`which pdpython`
    if [[ -z $PYTHON ]];then
        PYTHON=`which python`
    fi
    imgfile="data/65.jpeg"
    # Extract the network name from the first "name" field in the prototxt.
    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
    $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
    ret=$?
fi

exit $ret
a demo to show converting caffe model on 'mnist' using caffe2fluid
---
# How to use
1. prepare python environment
2. download caffe model to "models.caffe/lenet" which contains "lenet.caffemodel" and "lenet.prototxt"
3. run the tool
eg: bash ./run.sh lenet ./models.caffe/lenet ./models/lenet
...@@ -4,12 +4,12 @@ ...@@ -4,12 +4,12 @@
# demo to show how to use converted model using caffe2fluid # demo to show how to use converted model using caffe2fluid
# #
import sys
import os
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
from lenet import LeNet as MyNet
def test_model(exe, test_program, fetch_list, test_reader, feeder): def test_model(exe, test_program, fetch_list, test_reader, feeder):
acc_set = [] acc_set = []
...@@ -24,10 +24,15 @@ def test_model(exe, test_program, fetch_list, test_reader, feeder): ...@@ -24,10 +24,15 @@ def test_model(exe, test_program, fetch_list, test_reader, feeder):
return float(acc_val) return float(acc_val)
def main(model_path): def evaluate(net_file, model_file):
""" main """ main
""" """
print('load fluid model in %s' % (model_path)) #1, build model
net_path = os.path.dirname(net_file)
if net_path not in sys.path:
sys.path.insert(0, net_path)
from lenet import LeNet as MyNet
with_gpu = False with_gpu = False
paddle.init(use_gpu=with_gpu) paddle.init(use_gpu=with_gpu)
...@@ -45,10 +50,10 @@ def main(model_path): ...@@ -45,10 +50,10 @@ def main(model_path):
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
#2, load weights #2, load weights
if model_path.find('.npy') > 0: if model_file.find('.npy') > 0:
net.load(data_path=model_path, exe=exe, place=place) net.load(data_path=model_file, exe=exe, place=place)
else: else:
net.load(data_path=model_path, exe=exe) net.load(data_path=model_file, exe=exe)
#3, test this model #3, test this model
test_program = fluid.default_main_program().clone() test_program = fluid.default_main_program().clone()
...@@ -65,10 +70,17 @@ def main(model_path): ...@@ -65,10 +70,17 @@ def main(model_path):
if __name__ == "__main__": if __name__ == "__main__":
import sys net_file = 'models/lenet/lenet.py'
if len(sys.argv) == 2: weight_file = 'models/lenet/lenet.npy'
fluid_model_path = sys.argv[1]
else: argc = len(sys.argv)
fluid_model_path = './model.fluid' if argc == 3:
net_file = sys.argv[1]
main(fluid_model_path) weight_file = sys.argv[2]
elif argc > 1:
print('usage:')
print('\tpython %s [net_file] [weight_file]' % (sys.argv[0]))
print('\teg:python %s %s %s %s' % (sys.argv[0], net_file, weight_file))
sys.exit(1)
evaluate(net_file, weight_file)
#!/bin/bash
#function:
#   a tool used to:
#   1, convert a caffe model
#   2, do inference using this model
#
#usage:
#   bash run.sh lenet ./models.caffe/lenet ./models/lenet
#

#set -x

# Require model name, caffe model dir and output dir; the optional 4th
# argument skips the evaluation step after conversion.
if [[ $# -lt 3 ]];then
    echo "usage:"
    echo " bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
    echo " eg: bash $0 lenet ./models.caffe/lenet ./models/lenet"
    exit 1
else
    model_name=$1
    cf_model_path=$2
    pd_model_path=$3
    # Fix: the 4th argument was assigned to 'no_eval' while the guard
    # below tests '$only_convert', so it was silently ignored and the
    # evaluation step always ran. Use one consistent name, matching the
    # usage text and the sibling imagenet script.
    only_convert=$4
fi

# Expected input/output file layout, derived from the model name.
proto_file=$cf_model_path/${model_name}.prototxt
caffemodel_file=$cf_model_path/${model_name}.caffemodel
weight_file=$pd_model_path/${model_name}.npy
net_file=$pd_model_path/${model_name}.py

if [[ ! -e $proto_file ]];then
    echo "not found prototxt[$proto_file]"
    exit 1
fi

if [[ ! -e $caffemodel_file ]];then
    echo "not found caffemodel[$caffemodel_file]"
    exit 1
fi

if [[ ! -e $pd_model_path ]];then
    mkdir $pd_model_path
fi

# Prefer a dedicated python for the caffe side ('cfpython') if installed.
PYTHON=`which cfpython`
if [[ -z $PYTHON ]];then
    PYTHON=`which python`
fi
$PYTHON ../../convert.py \
        $proto_file \
        --caffemodel $caffemodel_file \
        --data-output-path $weight_file\
        --code-output-path $net_file
ret=$?
if [[ $ret -ne 0 ]];then
    echo "failed to convert caffe model[$cf_model_path]"
    exit $ret
else
    echo "succeed to convert caffe model[$cf_model_path] to fluid model[$pd_model_path]"
fi

if [[ -z $only_convert ]];then
    # Prefer a dedicated python with paddle installed ('pdpython').
    PYTHON=`which pdpython`
    if [[ -z $PYTHON ]];then
        PYTHON=`which python`
    fi
    # Extract the network name from the first "name" field in the prototxt.
    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
    if [[ $net_name != "LeNet" ]];then
        echo "only support LeNet"
        exit 1
    fi
    $PYTHON ./evaluate.py $net_file $weight_file
    ret=$?
fi

exit $ret
...@@ -54,7 +54,6 @@ def show_fallback_warning(): ...@@ -54,7 +54,6 @@ def show_fallback_warning():
WARNING: PyCaffe not found! WARNING: PyCaffe not found!
Falling back to a pure protocol buffer implementation. Falling back to a pure protocol buffer implementation.
* Conversions will be drastically slower. * Conversions will be drastically slower.
* This backend is UNTESTED!
------------------------------------------------------------ ------------------------------------------------------------
''' '''
......
...@@ -175,6 +175,7 @@ class GraphBuilder(object): ...@@ -175,6 +175,7 @@ class GraphBuilder(object):
kind = NodeKind.map_raw_kind(layer.type) kind = NodeKind.map_raw_kind(layer.type)
if kind is None: if kind is None:
raise KaffeError('Unknown layer type encountered: %s' % layer.type) raise KaffeError('Unknown layer type encountered: %s' % layer.type)
# We want to use the layer's top names (the "output" names), rather than the # We want to use the layer's top names (the "output" names), rather than the
# name attribute, which is more of readability thing than a functional one. # name attribute, which is more of readability thing than a functional one.
# Other layers will refer to a node by its "top name". # Other layers will refer to a node by its "top name".
...@@ -235,6 +236,7 @@ class GraphBuilder(object): ...@@ -235,6 +236,7 @@ class GraphBuilder(object):
node.add_parent(parent_node) node.add_parent(parent_node)
if len(layer.top) > 1: if len(layer.top) > 1:
raise KaffeError('Multiple top nodes are not supported.') raise KaffeError('Multiple top nodes are not supported.')
for output_name in layer.top: for output_name in layer.top:
if output_name == layer.name: if output_name == layer.name:
# Output is named the same as the node. No further action required. # Output is named the same as the node. No further action required.
......
...@@ -51,17 +51,79 @@ LAYER_DESCRIPTORS = { ...@@ -51,17 +51,79 @@ LAYER_DESCRIPTORS = {
'Threshold': shape_identity, 'Threshold': shape_identity,
} }
LAYER_TYPES = LAYER_DESCRIPTORS.keys() # layer types in 'V1LayerParameter'
# (V1 layer type name, V1LayerParameter enum value).
# get_v1_layer_map() below matches each name case-insensitively against the
# modern layer types in LAYER_DESCRIPTORS to build the enum->type mapping.
v1_layertypes = [
    ('ABSVAL', 35),
    ('ACCURACY', 1),
    ('ARGMAX', 30),
    ('BNLL', 2),
    ('CONCAT', 3),
    ('CONVOLUTION', 4),
    ('DATA', 5),
    ('DECONVOLUTION', 39),
    ('DROPOUT', 6),
    ('ELTWISE', 25),
    ('EXP', 38),
    ('FLATTEN', 8),
    ('IM2COL', 11),
    ('INNERPRODUCT', 14),
    ('LRN', 15),
    ('MEMORYDATA', 29),
    ('MULTINOMIALLOGISTICLOSS', 16),
    ('MVN', 34),
    ('POOLING', 17),
    ('POWER', 26),
    ('RELU', 18),
    ('SIGMOID', 19),
    ('SIGMOIDCROSSENTROPYLOSS', 27),
    ('SILENCE', 36),
    ('SOFTMAX', 20),
    ('SPLIT', 22),
    ('SLICE', 33),
    ('TANH', 23),
    ('WINDOWDATA', 24),
    ('THRESHOLD', 31),
]
LAYER_TYPES = LAYER_DESCRIPTORS.keys()
LayerType = type('LayerType', (), {t: t for t in LAYER_TYPES}) LayerType = type('LayerType', (), {t: t for t in LAYER_TYPES})
# Lazily-built mapping from a V1 layer enum value to the standard layer name.
# The sentinel key marks the map as not yet initialized.
V1_LAYER_MAP = {'_not_init_': True}


def get_v1_layer_map():
    """Return the {V1 enum value: standard layer type} map, building it once.

    Raises KaffeError if a V1 entry has no matching modern layer type or
    its enum value is already mapped.
    """
    global V1_LAYER_MAP
    if '_not_init_' not in V1_LAYER_MAP:
        # Already built on an earlier call.
        return V1_LAYER_MAP
    del V1_LAYER_MAP['_not_init_']

    # Upper-cased modern type name -> original type name, for matching.
    upper_to_type = {t.upper(): t for t in LAYER_TYPES}
    for v1_name, enum_value in v1_layertypes:
        if v1_name in upper_to_type and enum_value not in V1_LAYER_MAP:
            V1_LAYER_MAP[enum_value] = upper_to_type[v1_name]
        else:
            raise KaffeError('not found v1 layer type %s' % v1_name)
    return V1_LAYER_MAP
class NodeKind(LayerType): class NodeKind(LayerType):
@staticmethod @staticmethod
def map_raw_kind(kind): def map_raw_kind(kind):
if kind in LAYER_TYPES: if kind in LAYER_TYPES:
return kind return kind
return None
v1_layers = get_v1_layer_map()
if kind in v1_layers:
return v1_layers[kind]
else:
return None
@staticmethod @staticmethod
def compute_output_shape(node): def compute_output_shape(node):
......
...@@ -27,6 +27,9 @@ def layer(op): ...@@ -27,6 +27,9 @@ def layer(op):
self.layers[name] = layer_output self.layers[name] = layer_output
# This output is now the input for the next layer. # This output is now the input for the next layer.
self.feed(layer_output) self.feed(layer_output)
#print('output shape of %s:' % (name))
#print layer_output.shape
# Return self for chained calls. # Return self for chained calls.
return self return self
...@@ -158,41 +161,64 @@ class Network(object): ...@@ -158,41 +161,64 @@ class Network(object):
output = fluid.layers.relu(x=input) output = fluid.layers.relu(x=input)
return output return output
@layer def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): #adjust the padding if needed
if padding is None: i_h, i_w = i_hw
padding = [0, 0] k_h, k_w = k_hw
s_h, s_w = s_hw
p_h, p_w = p_hw
def is_consistent(i, k, s, p):
o = i + 2 * p - k
if o % s == 0:
return True
else:
return False
real_p_h = 0
real_p_w = 0
if is_consistent(i_h, k_h, s_h, p_h) is False:
real_p_h = int(k_h / 2)
if is_consistent(i_w, k_w, s_w, p_w) is False:
real_p_w = int(k_w / 2)
return [real_p_h, real_p_w]
def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
# Get the number of channels in the input # Get the number of channels in the input
h_i, w_i = input.shape[2:] in_hw = input.shape[2:]
fluid = import_fluid() k_hw = [k_h, k_w]
output = fluid.layers.pool2d( s_hw = [s_h, s_w]
input=input,
pool_size=[k_h, k_w],
pool_stride=[s_h, s_w],
pool_padding=padding,
pool_type='max')
return output
@layer
def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
if padding is None: if padding is None:
padding = [0, 0] #fix bug about the difference between conv and pool
#more info: https://github.com/BVLC/caffe/issues/1318
padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])
# Get the number of channels in the input
h_i, w_i = input.shape[2:]
fluid = import_fluid() fluid = import_fluid()
output = fluid.layers.pool2d( output = fluid.layers.pool2d(
input=input, input=input,
pool_size=[k_h, k_w], pool_size=k_hw,
pool_stride=[s_h, s_w], pool_stride=s_hw,
pool_padding=padding, pool_padding=padding,
pool_type='avg') pool_type=pool_type)
return output return output
@layer
def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)
@layer
def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)
@layer @layer
def lrn(self, input, radius, alpha, beta, name, bias=1.0): def lrn(self, input, radius, alpha, beta, name, bias=1.0):
raise Exception('lrn() not implemented yet') fluid = import_fluid()
output = fluid.layers.lrn(input=input, \
n=radius, k=bias, alpha=alpha, beta=beta, name=name)
return output
@layer @layer
def concat(self, inputs, axis, name): def concat(self, inputs, axis, name):
...@@ -228,7 +254,7 @@ class Network(object): ...@@ -228,7 +254,7 @@ class Network(object):
@layer @layer
def softmax(self, input, name): def softmax(self, input, name):
fluid = import_fluid() fluid = import_fluid()
output = fluid.layers.softmax(x=input, name=name) output = fluid.layers.softmax(input)
return output return output
@layer @layer
...@@ -256,5 +282,8 @@ class Network(object): ...@@ -256,5 +282,8 @@ class Network(object):
return output return output
@layer @layer
def dropout(self, input, keep_prob, name): def dropout(self, input, drop_prob, name, is_test=True):
raise Exception('dropout() not implemented yet') fluid = import_fluid()
output = fluid.layers.dropout(
input, dropout_prob=drop_prob, is_test=is_test, name=name)
return output
...@@ -132,8 +132,7 @@ class TensorFlowMapper(NodeMapper): ...@@ -132,8 +132,7 @@ class TensorFlowMapper(NodeMapper):
# just scales by alpha (as does Krizhevsky's paper). # just scales by alpha (as does Krizhevsky's paper).
# We'll account for that here. # We'll account for that here.
alpha = params.alpha / float(params.local_size) alpha = params.alpha / float(params.local_size)
return TensorFlowNode('lrn', return TensorFlowNode('lrn', params.local_size, alpha, params.beta)
int(params.local_size / 2), alpha, params.beta)
def map_concat(self, node): def map_concat(self, node):
return TensorFlowNode('concat', node.parameters.axis) return TensorFlowNode('concat', node.parameters.axis)
...@@ -191,22 +190,33 @@ class TensorFlowEmitter(object): ...@@ -191,22 +190,33 @@ class TensorFlowEmitter(object):
def emit_setup_def(self): def emit_setup_def(self):
return self.statement('def setup(self):') return self.statement('def setup(self):')
def emit_convert_def(self, input_nodes): def emit_shape_def(self, input_nodes):
def data_layer_def(name, shape, dtype=None): self.outdent()
if dtype is None: func_def = self.statement('@classmethod')
dtype = 'float32' func_def += self.statement('def input_shapes(cls):')
self.indent()
layer_var = name + '_layer' input_shapes = {}
shape = [str(s) for s in shape[1:]] for n in input_nodes:
layer_def = '%s = fluid.layers.data(name="%s", shape=[%s], dtype="%s")'\ name = n.name
% (layer_var, name, ','.join(shape), dtype) output_shape = n.output_shape
return layer_var, layer_def shape = [str(s) for s in output_shape[1:]]
input_shapes[name] = ', '.join(shape)
input_shapes = ['"%s": [%s]' % (n, l) for n, l in input_shapes.items()]
shape_str = ','.join(input_shapes)
func_def += self.statement('return {%s}' % (shape_str))
return '\n\n' + func_def
def emit_convert_def(self, input_nodes):
codes = [] codes = []
inputs = {} inputs = {}
codes.append('shapes = cls.input_shapes()')
for n in input_nodes: for n in input_nodes:
name = n.name name = n.name
layer_var, layer_def = data_layer_def(n.name, n.output_shape) layer_var = name + '_layer'
layer_def = '%s = fluid.layers.data(name="%s", shape=shapes["%s"],'\
' dtype="float32")' % (layer_var, name, name)
#layer_var, layer_def = data_layer_def(n.name, n.output_shape)
codes.append(layer_def) codes.append(layer_def)
inputs[name] = layer_var inputs[name] = layer_var
...@@ -229,7 +239,7 @@ class TensorFlowEmitter(object): ...@@ -229,7 +239,7 @@ class TensorFlowEmitter(object):
func_def += self.statement('import paddle.v2.fluid as fluid') func_def += self.statement('import paddle.v2.fluid as fluid')
for l in codes: for l in codes:
func_def += self.statement(l) func_def += self.statement(l)
return '\n\n' + func_def return '\n' + func_def
def emit_main_def(self, name): def emit_main_def(self, name):
if name is None: if name is None:
...@@ -273,6 +283,7 @@ class TensorFlowEmitter(object): ...@@ -273,6 +283,7 @@ class TensorFlowEmitter(object):
b += self.emit_node(node) b += self.emit_node(node)
blocks.append(b[:-1]) blocks.append(b[:-1])
s = s + '\n\n'.join(blocks) s = s + '\n\n'.join(blocks)
s += self.emit_shape_def(input_nodes)
s += self.emit_convert_def(input_nodes) s += self.emit_convert_def(input_nodes)
s += self.emit_main_def(name) s += self.emit_main_def(name)
return s return s
......
### Convert lenet model from caffe format into paddle format(fluid api)
### Howto
1, Prepare your caffepb.py
2, Download a lenet caffe-model
lenet_iter_10000.caffemodel
download address: https://github.com/ethereon/caffe-tensorflow/raw/master/examples/mnist/lenet_iter_10000.caffemodel
md5: cbec75c1c374b6c1981c4a1eb024ae01
lenet.prototxt
download address: https://raw.githubusercontent.com/BVLC/caffe/master/examples/mnist/lenet.prototxt
md5: 27384af843338ab90b00c8d1c81de7d5
3, Convert this model (make sure caffepb.py is ready in ../../proto)
convert to npy format
bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy
save to fluid format(optional)
bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy && python ./lenet.py ./lenet.npy ./fluid.model
4, Use this new model(paddle installed in this python)
use fluid format
python ./predict.py ./fluid.model
use npy format
python ./predict.py ./lenet.npy
#!/bin/bash
#function:
#   convert a caffe model
# eg:
#   bash ./convert.sh ./model.caffe/lenet.prototxt ./model.caffe/lenet.caffemodel lenet.py lenet.npy
if [[ $# -ne 4 ]];then
    echo "usage:"
    echo "  bash $0 [PROTOTXT] [CAFFEMODEL] [PY_NAME] [WEIGHT_NAME]"
    echo "  eg: bash $0 lenet.prototxt lenet.caffemodel lenet.py lenet.npy"
    exit 1
fi

# directory containing this script, so relative paths work from any cwd
WORK_ROOT=$(dirname `readlink -f ${BASH_SOURCE[0]}`)

# the caller may pre-select an interpreter via $PYTHON
if [[ -z $PYTHON ]];then
    PYTHON=`which python`
fi
PROTOTXT=$1
CAFFEMODEL=$2
PY_NAME=$3
WEIGHT_NAME=$4
CONVERTER_PY="$WORK_ROOT/../../convert.py"

# paths are quoted so arguments containing spaces survive word splitting
$PYTHON "$CONVERTER_PY" "$PROTOTXT" --caffemodel "$CAFFEMODEL" --code-output-path="$PY_NAME" --data-output-path="$WEIGHT_NAME"
ret=$?
if [[ $ret -eq 0 ]];then
    echo "succeed to convert caffe model[$CAFFEMODEL, $PROTOTXT] to paddle model[$PY_NAME, $WEIGHT_NAME]"
else
    echo "failed to convert caffe model[$CAFFEMODEL, $PROTOTXT]"
fi
exit $ret
### generated by caffe2fluid, your net is in class "LeNet" ###
import math
import os
import numpy as np
def import_fluid():
    """Import paddle's fluid module lazily and return it."""
    import paddle.v2.fluid as fluid
    return fluid
def layer(op):
    '''Decorator for composable network layers.

    Wraps a layer-building method so that it:
      1. pulls its input from the network's current terminal node(s),
      2. runs the wrapped op on that input,
      3. registers the op's output under a (possibly auto-generated) name,
      4. re-feeds the output so calls can be chained fluently.

    Returns the wrapping function; the wrapper itself returns ``self``.
    '''
    import functools

    # functools.wraps preserves op's __name__/__doc__ on the wrapper, so
    # introspection and error messages refer to the real layer method.
    @functools.wraps(op)
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided.
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs.
        if len(self.terminals) == 0:
            raise RuntimeError('No input variables found for layer %s.' % name)
        elif len(self.terminals) == 1:
            layer_input = self.terminals[0]
        else:
            layer_input = list(self.terminals)
        # Perform the operation and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add to layer LUT.
        self.layers[name] = layer_output
        # This output is now the input for the next layer.
        self.feed(layer_output)
        # Return self for chained calls.
        return self

    return layer_decorated
class Network(object):
    """Base class for networks generated by caffe2fluid.

    A subclass implements setup() by chaining the @layer-decorated builder
    methods; each call consumes the current terminal node(s) and feeds its
    output forward, so the graph is described fluently.
    NOTE(review): written for Python 2 (uses `basestring` and `iteritems`).
    """

    def __init__(self, inputs, trainable=True):
        # The input nodes for this network
        self.inputs = inputs
        # The current list of terminal nodes
        self.terminals = []
        # Mapping from layer names to layers
        self.layers = dict(inputs)
        # If true, the resulting variables are set as trainable
        self.trainable = trainable
        # Lazily-created {'place': ..., 'exe': ...} used by load() when the
        # caller does not supply an executor/place.
        self.paddle_env = None
        self.setup()

    def setup(self):
        '''Construct the network. Must be implemented by generated subclasses.'''
        raise NotImplementedError('Must be implemented by the subclass.')

    def load(self, data_path, exe=None, place=None, ignore_missing=False):
        '''Load network weights.
        data_path: The path to the numpy-serialized network weights
        exe/place: optional fluid executor and place; created on demand
        ignore_missing: If true, serialized weights for missing layers are ignored.
        '''
        fluid = import_fluid()
        # A directory means a fluid-format model: load persistables directly.
        if os.path.isdir(data_path):
            assert (exe is not None), \
                'must provide a executor to load fluid model'
            fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path)
            return True

        # Otherwise load from a .npy file of {op_name: {param_name: ndarray}}.
        if exe is None or place is None:
            if self.paddle_env is None:
                # First call without an executor: build and cache one.
                place = fluid.CPUPlace()
                exe = fluid.Executor(place)
                self.paddle_env = {'place': place, 'exe': exe}
                exe = exe.run(fluid.default_startup_program())
            else:
                place = self.paddle_env['place']
                exe = self.paddle_env['exe']

        data_dict = np.load(data_path).item()
        for op_name in data_dict:
            # NOTE(review): `layer` is looked up but never used; kept as-is
            # (also validates that op_name exists in this network).
            layer = self.layers[op_name]
            for param_name, data in data_dict[op_name].iteritems():
                try:
                    # Parameters are named '<op>_<param>' by the builders below.
                    name = '%s_%s' % (op_name, param_name)
                    v = fluid.global_scope().find_var(name)
                    w = v.get_tensor()
                    w.set(data, place)
                except ValueError:
                    if not ignore_missing:
                        raise
        return True

    def feed(self, *args):
        '''Set the input(s) for the next operation by replacing the terminal nodes.
        The arguments can be either layer names or the actual layers.
        '''
        assert len(args) != 0
        self.terminals = []
        for fed_layer in args:
            if isinstance(fed_layer, basestring):  # Python 2 only
                try:
                    fed_layer = self.layers[fed_layer]
                except KeyError:
                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
            self.terminals.append(fed_layer)
        return self

    def get_output(self):
        '''Returns the current network output.'''
        return self.terminals[-1]

    def get_unique_name(self, prefix):
        '''Returns an index-suffixed unique name for the given prefix.
        This is used for auto-generating layer names based on the type-prefix.
        '''
        ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
        return '%s_%d' % (prefix, ident)

    @layer
    def conv(self,
             input,
             k_h,
             k_w,
             c_o,
             s_h,
             s_w,
             name,
             relu=True,
             padding=None,
             group=1,
             biased=True):
        '''2D convolution with optional fused ReLU.

        k_h/k_w: kernel size; c_o: output channels; s_h/s_w: strides.
        NOTE(review): `biased` is accepted but ignored — a bias ParamAttr
        is always created below.
        '''
        if padding is None:
            padding = [0, 0]

        # Get the number of channels in the input
        c_i, h_i, w_i = input.shape[1:]

        # Verify that the grouping parameter is valid
        assert c_i % group == 0
        assert c_o % group == 0

        fluid = import_fluid()
        prefix = name + '_'
        output = fluid.layers.conv2d(
            input=input,
            filter_size=[k_h, k_w],
            num_filters=c_o,
            stride=[s_h, s_w],
            padding=padding,
            groups=group,
            param_attr=fluid.ParamAttr(name=prefix + "weights"),
            bias_attr=fluid.ParamAttr(name=prefix + "biases"),
            act="relu" if relu is True else None)
        return output

    @layer
    def relu(self, input, name):
        '''Elementwise ReLU activation.'''
        fluid = import_fluid()
        output = fluid.layers.relu(x=input)
        return output

    @layer
    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
        '''Max pooling with kernel [k_h, k_w] and stride [s_h, s_w].'''
        if padding is None:
            padding = [0, 0]

        # Get the number of channels in the input
        # NOTE(review): h_i/w_i are computed but unused; kept as-is.
        h_i, w_i = input.shape[2:]

        fluid = import_fluid()
        output = fluid.layers.pool2d(
            input=input,
            pool_size=[k_h, k_w],
            pool_stride=[s_h, s_w],
            pool_padding=padding,
            pool_type='max')
        return output

    @layer
    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
        '''Average pooling with kernel [k_h, k_w] and stride [s_h, s_w].'''
        if padding is None:
            padding = [0, 0]

        # Get the number of channels in the input
        # NOTE(review): h_i/w_i are computed but unused; kept as-is.
        h_i, w_i = input.shape[2:]

        fluid = import_fluid()
        output = fluid.layers.pool2d(
            input=input,
            pool_size=[k_h, k_w],
            pool_stride=[s_h, s_w],
            pool_padding=padding,
            pool_type='avg')
        return output

    @layer
    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
        '''Local response normalization — not implemented in this version.'''
        raise Exception('lrn() not implemented yet')

    @layer
    def concat(self, inputs, axis, name):
        '''Concatenate all current terminal nodes along `axis`.'''
        fluid = import_fluid()
        output = fluid.layers.concat(input=inputs, axis=axis)
        return output

    @layer
    def add(self, inputs, name):
        '''Elementwise sum of all current terminal nodes.'''
        fluid = import_fluid()
        output = inputs[0]
        for i in inputs[1:]:
            output = fluid.layers.elementwise_add(x=output, y=i)
        return output

    @layer
    def fc(self, input, num_out, name, relu=True, act=None):
        '''Fully-connected layer; `act` overrides the `relu` flag when set.'''
        fluid = import_fluid()
        if act is None:
            act = 'relu' if relu is True else None

        prefix = name + '_'
        output = fluid.layers.fc(
            name=name,
            input=input,
            size=num_out,
            act=act,
            param_attr=fluid.ParamAttr(name=prefix + 'weights'),
            bias_attr=fluid.ParamAttr(name=prefix + 'biases'))
        return output

    @layer
    def softmax(self, input, name):
        '''Softmax over the input.'''
        fluid = import_fluid()
        output = fluid.layers.softmax(x=input, name=name)
        return output

    @layer
    def batch_normalization(self, input, name, scale_offset=True, relu=False):
        # NOTE: Currently, only inference is supported
        fluid = import_fluid()
        prefix = name + '_'
        param_attr = None if scale_offset is False else fluid.ParamAttr(
            name=prefix + 'scale')
        bias_attr = None if scale_offset is False else fluid.ParamAttr(
            name=prefix + 'offset')
        mean_name = prefix + 'mean'
        variance_name = prefix + 'variance'
        output = fluid.layers.batch_norm(
            name=name,
            input=input,
            is_test=True,
            param_attr=param_attr,
            bias_attr=bias_attr,
            moving_mean_name=mean_name,
            moving_variance_name=variance_name,
            epsilon=1e-5,
            act='relu' if relu is True else None)

        return output

    @layer
    def dropout(self, input, keep_prob, name):
        '''Dropout — not implemented in this version.'''
        raise Exception('dropout() not implemented yet')
class LeNet(Network):
    """LeNet topology emitted by caffe2fluid: two conv/max-pool stages,
    two fully-connected layers, and a softmax output."""

    def setup(self):
        # Build the graph by chaining the fluent @layer helpers; each call
        # returns self and feeds its output into the next layer.
        (self.feed('data')
             .conv(5, 5, 20, 1, 1, relu=False, name='conv1')
             .max_pool(2, 2, 2, 2, name='pool1')
             .conv(5, 5, 50, 1, 1, relu=False, name='conv2')
             .max_pool(2, 2, 2, 2, name='pool2')
             .fc(500, name='ip1')
             .fc(10, relu=False, name='ip2')
             .softmax(name='prob'))

    @classmethod
    def convert(cls, npy_model, fluid_path):
        """Load weights from the .npy file `npy_model` into this net and
        save them under `fluid_path` in fluid's persistable format."""
        import paddle.v2.fluid as fluid

        data_layer = fluid.layers.data(
            name="data", shape=[1, 28, 28], dtype="float32")
        net = cls({"data": data_layer})

        cpu_place = fluid.CPUPlace()
        exe = fluid.Executor(cpu_place)
        exe.run(fluid.default_startup_program())

        net.load(data_path=npy_model, exe=exe, place=cpu_place)
        fluid.io.save_persistables(executor=exe, dirname=fluid_path)
if __name__ == "__main__":
    #usage: python xxxnet.py xxx.npy ./model
    import sys

    # Fail with a clear usage line instead of an IndexError traceback
    # when the two required arguments are missing.
    if len(sys.argv) != 3:
        print('usage: python %s [xxx.npy] [save_dir]' % (sys.argv[0]))
        exit(1)

    npy_weight = sys.argv[1]  # converted weights in numpy .npy format
    fluid_model = sys.argv[2]  # output directory for the fluid model
    LeNet.convert(npy_weight, fluid_model)
    exit(0)
import os import os
import numpy as np
import time
import sys
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import reader import reader
...@@ -65,20 +68,44 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): ...@@ -65,20 +68,44 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu') return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt(input, class_dim, infer=False): def SE_ResNeXt(input, class_dim, infer=False, layers=50):
cardinality = 64 supported_layers = [50, 152]
reduction_ratio = 16 if layers not in supported_layers:
depth = [3, 8, 36, 3] print("supported layers are", supported_layers, "but input layer is",
num_filters = [128, 256, 512, 1024] layers)
exit()
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = conv_bn_layer( conv = conv_bn_layer(
input=input, num_filters=64, filter_size=3, stride=2, act='relu') input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = conv_bn_layer( conv = fluid.layers.pool2d(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu') input=conv,
conv = conv_bn_layer( pool_size=3,
input=conv, num_filters=128, filter_size=3, stride=1, act='relu') pool_stride=2,
conv = fluid.layers.pool2d( pool_padding=1,
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = conv_bn_layer(
input=input, num_filters=64, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)): for block in range(len(depth)):
for i in range(depth[block]): for i in range(depth[block]):
...@@ -104,7 +131,10 @@ def train(learning_rate, ...@@ -104,7 +131,10 @@ def train(learning_rate,
num_passes, num_passes,
init_model=None, init_model=None,
model_save_dir='model', model_save_dir='model',
parallel=True): parallel=True,
use_nccl=True,
lr_strategy=None,
layers=50):
class_dim = 1000 class_dim = 1000
image_shape = [3, 224, 224] image_shape = [3, 224, 224]
...@@ -113,36 +143,52 @@ def train(learning_rate, ...@@ -113,36 +143,52 @@ def train(learning_rate,
if parallel: if parallel:
places = fluid.layers.get_places() places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
with pd.do(): with pd.do():
image_ = pd.read_input(image) image_ = pd.read_input(image)
label_ = pd.read_input(label) label_ = pd.read_input(label)
out = SE_ResNeXt(input=image_, class_dim=class_dim) out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label_) cost = fluid.layers.cross_entropy(input=out, label=label_)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
accuracy = fluid.layers.accuracy(input=out, label=label_) acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
pd.write_output(avg_cost) pd.write_output(avg_cost)
pd.write_output(accuracy) pd.write_output(acc_top1)
pd.write_output(acc_top5)
avg_cost, accuracy = pd() avg_cost, acc_top1, acc_top5 = pd()
avg_cost = fluid.layers.mean(x=avg_cost) avg_cost = fluid.layers.mean(x=avg_cost)
accuracy = fluid.layers.mean(x=accuracy) acc_top1 = fluid.layers.mean(x=acc_top1)
acc_top5 = fluid.layers.mean(x=acc_top5)
else: else:
out = SE_ResNeXt(input=image, class_dim=class_dim) out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label) cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
accuracy = fluid.layers.accuracy(input=out, label=label) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
if lr_strategy is None:
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
else:
bd = lr_strategy["bd"]
lr = lr_strategy["lr"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opts = optimizer.minimize(avg_cost) opts = optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program())
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program): with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program([avg_cost, accuracy]) inference_program = fluid.io.get_inference_program(
[avg_cost, acc_top1, acc_top5])
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -156,34 +202,86 @@ def train(learning_rate, ...@@ -156,34 +202,86 @@ def train(learning_rate,
feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
for pass_id in range(num_passes): for pass_id in range(num_passes):
train_info = [[], [], []]
test_info = [[], [], []]
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
loss = exe.run(fluid.default_main_program(), t1 = time.time()
feed=feeder.feed(data), loss, acc1, acc5 = exe.run(
fetch_list=[avg_cost]) fluid.default_main_program(),
print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id, feed=feeder.feed(data),
float(loss[0]))) fetch_list=[avg_cost, acc_top1, acc_top5])
t2 = time.time()
total_loss = 0.0 period = t2 - t1
total_acc = 0.0 train_info[0].append(loss[0])
total_batch = 0 train_info[1].append(acc1[0])
train_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
for data in test_reader(): for data in test_reader():
loss, acc = exe.run(inference_program, t1 = time.time()
feed=feeder.feed(data), loss, acc1, acc5 = exe.run(
fetch_list=[avg_cost, accuracy]) inference_program,
total_loss += float(loss) feed=feeder.feed(data),
total_acc += float(acc) fetch_list=[avg_cost, acc_top1, acc_top5])
total_batch += 1 t2 = time.time()
print("End pass {0}, test_loss {1}, test_acc {2}".format( period = t2 - t1
pass_id, total_loss / total_batch, total_acc / total_batch)) test_info[0].append(loss[0])
test_info[1].append(acc1[0])
test_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0},testbatch {1},loss {2}, \
acc1 {3},acc5 {4},time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
test_loss = np.array(test_info[0]).mean()
test_acc1 = np.array(test_info[1]).mean()
test_acc5 = np.array(test_info[2]).mean()
print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
test_loss {4}, test_acc1 {5}, test_acc5 {6}"
.format(pass_id, \
train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
test_acc5))
sys.stdout.flush()
model_path = os.path.join(model_save_dir, str(pass_id)) model_path = os.path.join(model_save_dir, str(pass_id))
fluid.io.save_inference_model(model_path, ['image'], [out], exe) if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
if __name__ == '__main__': if __name__ == '__main__':
epoch_points = [30, 60, 90]
total_images = 1281167
batch_size = 256
step = int(total_images / batch_size + 1)
bd = [e * step for e in epoch_points]
lr = [0.1, 0.01, 0.001, 0.0001]
lr_strategy = {"bd": bd, "lr": lr}
use_nccl = True
# layers: 50, 152
layers = 50
train( train(
learning_rate=0.1, learning_rate=0.1,
batch_size=8, batch_size=batch_size,
num_passes=100, num_passes=120,
init_model=None, init_model=None,
parallel=False) parallel=True,
use_nccl=True,
lr_strategy=lr_strategy,
layers=layers)
...@@ -22,7 +22,7 @@ class TrainTaskConfig(object): ...@@ -22,7 +22,7 @@ class TrainTaskConfig(object):
class InferTaskConfig(object): class InferTaskConfig(object):
use_gpu = False use_gpu = False
# the number of examples in one run for sequence generation. # the number of examples in one run for sequence generation.
batch_size = 10 batch_size = 1
# the parameters for beam search. # the parameters for beam search.
beam_size = 5 beam_size = 5
...@@ -92,7 +92,9 @@ encoder_input_data_names = ( ...@@ -92,7 +92,9 @@ encoder_input_data_names = (
"src_word", "src_word",
"src_pos", "src_pos",
"src_slf_attn_bias", "src_slf_attn_bias",
"src_data_shape", ) "src_data_shape",
"src_slf_attn_pre_softmax_shape",
"src_slf_attn_post_softmax_shape", )
# Names of all data layers in decoder listed in order. # Names of all data layers in decoder listed in order.
decoder_input_data_names = ( decoder_input_data_names = (
...@@ -101,6 +103,10 @@ decoder_input_data_names = ( ...@@ -101,6 +103,10 @@ decoder_input_data_names = (
"trg_slf_attn_bias", "trg_slf_attn_bias",
"trg_src_attn_bias", "trg_src_attn_bias",
"trg_data_shape", "trg_data_shape",
"trg_slf_attn_pre_softmax_shape",
"trg_slf_attn_post_softmax_shape",
"trg_src_attn_pre_softmax_shape",
"trg_src_attn_post_softmax_shape",
"enc_output", ) "enc_output", )
# Names of label related data layers listed in order. # Names of label related data layers listed in order.
......
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import model import model
...@@ -27,11 +27,19 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -27,11 +27,19 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
is_target=False, is_target=False,
return_pos=True, return_pos=True,
return_attn_bias=True, return_attn_bias=True,
return_max_len=True) return_max_len=False)
enc_in_data = enc_in_data[:-1] + [ # Append the data shape input to reshape the output of embedding layer.
enc_in_data = enc_in_data + [
np.array( np.array(
[batch_size, enc_in_data[-1], d_model], dtype="int32") [-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
] # Append the data shape input. ]
# Append the shape inputs to reshape before and after softmax in encoder
# self attention.
enc_in_data = enc_in_data + [
np.array(
[-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array(
enc_in_data[2].shape, dtype="int32")
]
enc_output = exe.run(encoder, enc_output = exe.run(encoder,
feed=dict(zip(enc_in_names, enc_in_data)), feed=dict(zip(enc_in_names, enc_in_data)),
fetch_list=enc_out_names)[0] fetch_list=enc_out_names)[0]
...@@ -73,8 +81,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -73,8 +81,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
trg_words = np.array( trg_words = np.array(
[[bos_idx]] * batch_size * beam_size, dtype="int64") [[bos_idx]] * batch_size * beam_size, dtype="int64")
trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64")
src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[-1][ src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[
1], enc_in_data[-2], 1 -1], enc_in_data[2], 1
# This is used to remove attention on subsequent words. # This is used to remove attention on subsequent words.
trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len,
trg_max_len)) trg_max_len))
...@@ -89,19 +97,38 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -89,19 +97,38 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
-1, src_slf_attn_bias.shape[1], trg_max_len, -1, src_slf_attn_bias.shape[1], trg_max_len,
src_slf_attn_bias.shape[-1] src_slf_attn_bias.shape[-1]
]) ])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array( trg_data_shape = np.array(
[batch_size * beam_size, trg_max_len, d_model], dtype="int32") [batch_size * beam_size, trg_max_len, d_model], dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# encoder-decoder attention.
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32")
enc_output = np.tile( enc_output = np.tile(
enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
[-1, enc_output.shape[-2], enc_output.shape[-1]]) [-1, enc_output.shape[-2], enc_output.shape[-1]])
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output
def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
""" """
Update the input data of decoder mainly by slicing from the previous Update the input data of decoder mainly by slicing from the previous
input data and dropping the finished instance beams. input data and dropping the finished instance beams.
""" """
trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = dec_in_data trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output = dec_in_data
trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
trg_words = np.array( trg_words = np.array(
[ [
...@@ -129,11 +156,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -129,11 +156,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
trg_src_attn_bias = np.tile(trg_src_attn_bias[ trg_src_attn_bias = np.tile(trg_src_attn_bias[
active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
[1, 1, trg_cur_len, 1]) [1, 1, trg_cur_len, 1])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array( trg_data_shape = np.array(
[len(active_beams) * beam_size, trg_cur_len, d_model], [len(active_beams) * beam_size, trg_cur_len, d_model],
dtype="int32") dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32")
# Append the shape inputs to reshape before and after softmax in
# encoder-decoder attention.
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32")
enc_output = enc_output[active_beams_indice, :, :] enc_output = enc_output[active_beams_indice, :, :]
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
trg_src_attn_post_softmax_shape, enc_output
dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
enc_output) enc_output)
......
...@@ -29,7 +29,9 @@ def multi_head_attention(queries, ...@@ -29,7 +29,9 @@ def multi_head_attention(queries,
d_value, d_value,
d_model, d_model,
n_head=1, n_head=1,
dropout_rate=0.): dropout_rate=0.,
pre_softmax_shape=None,
post_softmax_shape=None):
""" """
Multi-Head Attention. Note that attn_bias is added to the logit before Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activiation to mask certain selected positions so that computing softmax activiation to mask certain selected positions so that
...@@ -109,21 +111,16 @@ def multi_head_attention(queries, ...@@ -109,21 +111,16 @@ def multi_head_attention(queries,
""" """
Scaled Dot-Product Attention Scaled Dot-Product Attention
""" """
# FIXME(guosheng): Remove __softmax when softmax_op supporting high
# rank tensors. softmax_op only supports 2D tensor currently.
# Otherwise, add extra input data to reshape.
def __softmax(x, eps=1e-9):
exp_out = layers.exp(x=x)
sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
scaled_q = layers.scale(x=q, scale=d_model**-0.5) scaled_q = layers.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True) product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax( weights = layers.reshape(
layers.elementwise_add( x=layers.elementwise_add(
x=product, y=attn_bias) if attn_bias else product) x=product, y=attn_bias) if attn_bias else product,
# weights = __softmax(product) shape=[-1, product.shape[-1]],
actual_shape=pre_softmax_shape,
act="softmax")
weights = layers.reshape(
x=weights, shape=product.shape, actual_shape=post_softmax_shape)
if dropout_rate: if dropout_rate:
weights = layers.dropout( weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False) weights, dropout_prob=dropout_rate, is_test=False)
...@@ -248,7 +245,9 @@ def encoder_layer(enc_input, ...@@ -248,7 +245,9 @@ def encoder_layer(enc_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate=0.): dropout_rate=0.,
pre_softmax_shape=None,
post_softmax_shape=None):
"""The encoder layers that can be stacked to form a deep encoder. """The encoder layers that can be stacked to form a deep encoder.
This module consits of a multi-head (self) attention followed by This module consits of a multi-head (self) attention followed by
...@@ -256,9 +255,9 @@ def encoder_layer(enc_input, ...@@ -256,9 +255,9 @@ def encoder_layer(enc_input,
with the post_process_layer to add residual connection, layer normalization with the post_process_layer to add residual connection, layer normalization
and droput. and droput.
""" """
attn_output = multi_head_attention(enc_input, enc_input, enc_input, attn_output = multi_head_attention(
attn_bias, d_key, d_value, d_model, enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model,
n_head, dropout_rate) n_head, dropout_rate, pre_softmax_shape, post_softmax_shape)
attn_output = post_process_layer(enc_input, attn_output, "dan", attn_output = post_process_layer(enc_input, attn_output, "dan",
dropout_rate) dropout_rate)
ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
...@@ -273,7 +272,9 @@ def encoder(enc_input, ...@@ -273,7 +272,9 @@ def encoder(enc_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate=0.): dropout_rate=0.,
pre_softmax_shape=None,
post_softmax_shape=None):
""" """
The encoder is composed of a stack of identical layers returned by calling The encoder is composed of a stack of identical layers returned by calling
encoder_layer. encoder_layer.
...@@ -287,7 +288,9 @@ def encoder(enc_input, ...@@ -287,7 +288,9 @@ def encoder(enc_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate,
pre_softmax_shape,
post_softmax_shape, )
enc_input = enc_output enc_input = enc_output
return enc_output return enc_output
...@@ -301,7 +304,11 @@ def decoder_layer(dec_input, ...@@ -301,7 +304,11 @@ def decoder_layer(dec_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate=0.): dropout_rate=0.,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
""" The layer to be stacked in decoder part. """ The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except The structure of this module is similar to that in the encoder part except
...@@ -316,7 +323,9 @@ def decoder_layer(dec_input, ...@@ -316,7 +323,9 @@ def decoder_layer(dec_input,
d_value, d_value,
d_model, d_model,
n_head, n_head,
dropout_rate, ) dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, )
slf_attn_output = post_process_layer( slf_attn_output = post_process_layer(
dec_input, dec_input,
slf_attn_output, slf_attn_output,
...@@ -331,7 +340,9 @@ def decoder_layer(dec_input, ...@@ -331,7 +340,9 @@ def decoder_layer(dec_input,
d_value, d_value,
d_model, d_model,
n_head, n_head,
dropout_rate, ) dropout_rate,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
enc_attn_output = post_process_layer( enc_attn_output = post_process_layer(
slf_attn_output, slf_attn_output,
enc_attn_output, enc_attn_output,
...@@ -359,7 +370,11 @@ def decoder(dec_input, ...@@ -359,7 +370,11 @@ def decoder(dec_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate=0.): dropout_rate=0.,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
""" """
The decoder is composed of a stack of identical decoder_layer layers. The decoder is composed of a stack of identical decoder_layer layers.
""" """
...@@ -374,7 +389,11 @@ def decoder(dec_input, ...@@ -374,7 +389,11 @@ def decoder(dec_input,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
dec_input = dec_output dec_input = dec_output
return dec_output return dec_output
...@@ -383,11 +402,13 @@ def make_inputs(input_data_names, ...@@ -383,11 +402,13 @@ def make_inputs(input_data_names,
n_head, n_head,
d_model, d_model,
max_length, max_length,
is_pos=True, is_pos,
slf_attn_bias_flag=True, slf_attn_bias_flag,
src_attn_bias_flag=True, src_attn_bias_flag,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=True): data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True):
""" """
Define the input data layers for the transformer model. Define the input data layers for the transformer model.
""" """
...@@ -425,7 +446,8 @@ def make_inputs(input_data_names, ...@@ -425,7 +446,8 @@ def make_inputs(input_data_names,
append_batch_size=False) append_batch_size=False)
input_layers += [slf_attn_bias] input_layers += [slf_attn_bias]
if src_attn_bias_flag: if src_attn_bias_flag:
# This input is used to remove attention weights on paddings. # This input is used to remove attention weights on paddings. It's used
# in encoder-decoder attention.
# The actual data shape of slf_attn_bias_flag is: # The actual data shape of slf_attn_bias_flag is:
# [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch] # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch]
src_attn_bias = layers.data( src_attn_bias = layers.data(
...@@ -435,13 +457,41 @@ def make_inputs(input_data_names, ...@@ -435,13 +457,41 @@ def make_inputs(input_data_names,
append_batch_size=False) append_batch_size=False)
input_layers += [src_attn_bias] input_layers += [src_attn_bias]
if data_shape_flag: if data_shape_flag:
# This input is used to reshape. # This input is used to reshape the output of embedding layer.
data_shape = layers.data( data_shape = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[3], shape=[3],
dtype="int32", dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [data_shape] input_layers += [data_shape]
if slf_attn_shape_flag:
# This shape input is used to reshape before softmax in self attention.
slf_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[2],
dtype="int32",
append_batch_size=False)
input_layers += [slf_attn_pre_softmax_shape]
# This shape input is used to reshape after softmax in self attention.
slf_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[4],
dtype="int32",
append_batch_size=False)
input_layers += [slf_attn_post_softmax_shape]
if src_attn_shape_flag:
src_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[2],
dtype="int32",
append_batch_size=False)
input_layers += [src_attn_pre_softmax_shape]
src_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[4],
dtype="int32",
append_batch_size=False)
input_layers += [src_attn_post_softmax_shape]
if enc_output_flag: if enc_output_flag:
# This input is used in independent decoder program for inference. # This input is used in independent decoder program for inference.
# The actual data shape of slf_attn_bias_flag is: # The actual data shape of slf_attn_bias_flag is:
...@@ -452,6 +502,7 @@ def make_inputs(input_data_names, ...@@ -452,6 +502,7 @@ def make_inputs(input_data_names,
dtype="float32", dtype="float32",
append_batch_size=False) append_batch_size=False)
input_layers += [enc_output] input_layers += [enc_output]
return input_layers return input_layers
...@@ -469,8 +520,18 @@ def transformer( ...@@ -469,8 +520,18 @@ def transformer(
src_pad_idx, src_pad_idx,
trg_pad_idx, trg_pad_idx,
pos_pad_idx, ): pos_pad_idx, ):
enc_inputs = make_inputs(encoder_input_data_names, n_head, d_model, enc_inputs = make_inputs(
max_length, True, True, False) encoder_input_data_names,
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=False,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=False)
enc_output = wrap_encoder( enc_output = wrap_encoder(
src_vocab_size, src_vocab_size,
...@@ -486,8 +547,18 @@ def transformer( ...@@ -486,8 +547,18 @@ def transformer(
pos_pad_idx, pos_pad_idx,
enc_inputs, ) enc_inputs, )
dec_inputs = make_inputs(decoder_input_data_names, n_head, d_model, dec_inputs = make_inputs(
max_length, True, True, True) decoder_input_data_names,
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=True,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True)
predict = wrap_decoder( predict = wrap_decoder(
trg_vocab_size, trg_vocab_size,
...@@ -506,9 +577,19 @@ def transformer( ...@@ -506,9 +577,19 @@ def transformer(
# Padding index do not contribute to the total loss. The weights is used to # Padding index do not contribute to the total loss. The weights is used to
# cancel padding index in calculating the loss. # cancel padding index in calculating the loss.
gold, weights = make_inputs(label_data_names, n_head, d_model, max_length, gold, weights = make_inputs(
False, False, False, False, False) label_data_names,
cost = layers.cross_entropy(input=predict, label=gold) n_head,
d_model,
max_length,
is_pos=False,
slf_attn_bias_flag=False,
src_attn_bias_flag=False,
enc_output_flag=False,
data_shape_flag=False,
slf_attn_shape_flag=False,
src_attn_shape_flag=False)
cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
weighted_cost = cost * weights weighted_cost = cost * weights
return layers.reduce_sum(weighted_cost), predict return layers.reduce_sum(weighted_cost), predict
...@@ -530,12 +611,24 @@ def wrap_encoder(src_vocab_size, ...@@ -530,12 +611,24 @@ def wrap_encoder(src_vocab_size,
""" """
if enc_inputs is None: if enc_inputs is None:
# This is used to implement independent encoder program in inference. # This is used to implement independent encoder program in inference.
src_word, src_pos, src_slf_attn_bias, src_data_shape = make_inputs( src_word, src_pos, src_slf_attn_bias, src_data_shape, \
encoder_input_data_names, n_head, d_model, max_length, True, True, slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
False) make_inputs(
encoder_input_data_names,
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=False,
enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=False)
else: else:
src_word, src_pos, src_slf_attn_bias, src_data_shape = enc_inputs src_word, src_pos, src_slf_attn_bias, src_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
enc_inputs
enc_input = prepare_encoder( enc_input = prepare_encoder(
src_word, src_word,
src_pos, src_pos,
...@@ -555,7 +648,9 @@ def wrap_encoder(src_vocab_size, ...@@ -555,7 +648,9 @@ def wrap_encoder(src_vocab_size,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, )
return enc_output return enc_output
...@@ -577,11 +672,26 @@ def wrap_decoder(trg_vocab_size, ...@@ -577,11 +672,26 @@ def wrap_decoder(trg_vocab_size,
""" """
if dec_inputs is None: if dec_inputs is None:
# This is used to implement independent decoder program in inference. # This is used to implement independent decoder program in inference.
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = make_inputs( trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
decoder_input_data_names, n_head, d_model, max_length, True, True, trg_data_shape, slf_attn_pre_softmax_shape, \
True, True) slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
src_attn_post_softmax_shape, enc_output = make_inputs(
decoder_input_data_names,
n_head,
d_model,
max_length,
is_pos=True,
slf_attn_bias_flag=True,
src_attn_bias_flag=True,
enc_output_flag=True,
data_shape_flag=True,
slf_attn_shape_flag=True,
src_attn_shape_flag=True)
else: else:
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape = dec_inputs trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_data_shape, slf_attn_pre_softmax_shape, \
slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
src_attn_post_softmax_shape = dec_inputs
dec_input = prepare_decoder( dec_input = prepare_decoder(
trg_word, trg_word,
...@@ -604,13 +714,17 @@ def wrap_decoder(trg_vocab_size, ...@@ -604,13 +714,17 @@ def wrap_decoder(trg_vocab_size,
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
# Return logits for training and probs for inference.
predict = layers.reshape( predict = layers.reshape(
x=layers.fc(input=dec_output, x=layers.fc(input=dec_output,
size=trg_vocab_size, size=trg_vocab_size,
bias_attr=False, bias_attr=False,
num_flatten_dims=2), num_flatten_dims=2),
shape=[-1, trg_vocab_size], shape=[-1, trg_vocab_size],
act="softmax") act="softmax" if dec_inputs is None else None)
return predict return predict
import os import os
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from model import transformer, position_encoding_init from model import transformer, position_encoding_init
...@@ -66,16 +66,35 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -66,16 +66,35 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
[inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32") [1, 1, trg_max_len, 1]).astype("float32")
# These shape tensors are used in reshape_op.
src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array(
src_slf_attn_bias.shape, dtype="int32")
trg_slf_attn_pre_softmax_shape = np.array(
[-1, trg_slf_attn_bias.shape[-1]], dtype="int32")
trg_slf_attn_post_softmax_shape = np.array(
trg_slf_attn_bias.shape, dtype="int32")
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32")
lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head, lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head,
False, False, False, False) False, False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32")
input_dict = dict( input_dict = dict(
zip(input_data_names, [ zip(input_data_names, [
src_word, src_pos, src_slf_attn_bias, src_data_shape, trg_word, src_word, src_pos, src_slf_attn_bias, src_data_shape,
trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape,
lbl_word, lbl_weight trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
trg_data_shape, trg_slf_attn_pre_softmax_shape,
trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape,
trg_src_attn_post_softmax_shape, lbl_word, lbl_weight
])) ]))
return input_dict return input_dict
...@@ -122,6 +141,10 @@ def main(): ...@@ -122,6 +141,10 @@ def main():
def test(exe): def test(exe):
test_costs = [] test_costs = []
for batch_id, data in enumerate(val_data()): for batch_id, data in enumerate(val_data()):
if len(data) != TrainTaskConfig.batch_size:
# Since we use the sum cost, keep comparable cost by fixing the
# batch size. Remove this if the cost is mean.
continue
data_input = prepare_batch_input( data_input = prepare_batch_input(
data, encoder_input_data_names + decoder_input_data_names[:-1] + data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.src_pad_idx, label_data_names, ModelHyperParams.src_pad_idx,
......
from PIL import Image from PIL import Image, ImageEnhance
import numpy as np import numpy as np
import random import random
import math import math
...@@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): ...@@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
sample_img = img[ymin:ymax, xmin:xmax] sample_img = img[ymin:ymax, xmin:xmax]
sample_labels = transform_labels(bbox_labels, sample_bbox) sample_labels = transform_labels(bbox_labels, sample_bbox)
return sample_img, sample_labels return sample_img, sample_labels
def random_brightness(img, settings):
prob = random.uniform(0, 1)
if prob < settings._brightness_prob:
delta = random.uniform(-settings._brightness_delta,
settings._brightness_delta) + 1
img = ImageEnhance.Brightness(img).enhance(delta)
return img
def random_contrast(img, settings):
prob = random.uniform(0, 1)
if prob < settings._contrast_prob:
delta = random.uniform(-settings._contrast_delta,
settings._contrast_delta) + 1
img = ImageEnhance.Contrast(img).enhance(delta)
return img
def random_saturation(img, settings):
prob = random.uniform(0, 1)
if prob < settings._saturation_prob:
delta = random.uniform(-settings._saturation_delta,
settings._saturation_delta) + 1
img = ImageEnhance.Color(img).enhance(delta)
return img
def random_hue(img, settings):
prob = random.uniform(0, 1)
if prob < settings._hue_prob:
delta = random.uniform(-settings._hue_delta, settings._hue_delta)
img_hsv = np.array(img.convert('HSV'))
img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
return img
def distort_image(img, settings):
prob = random.uniform(0, 1)
# Apply different distort order
if prob > 0.5:
img = random_brightness(img, settings)
img = random_contrast(img, settings)
img = random_saturation(img, settings)
img = random_hue(img, settings)
else:
img = random_brightness(img, settings)
img = random_saturation(img, settings)
img = random_hue(img, settings)
img = random_contrast(img, settings)
return img
def expand_image(img, bbox_labels, img_width, img_height, settings):
prob = random.uniform(0, 1)
if prob < settings._expand_prob:
expand_ratio = random.uniform(1, settings._expand_max_ratio)
if expand_ratio - 1 >= 0.01:
height = int(img_height * expand_ratio)
width = int(img_width * expand_ratio)
h_off = math.floor(random.uniform(0, height - img_height))
w_off = math.floor(random.uniform(0, width - img_width))
expand_bbox = bbox(-w_off / img_width, -h_off / img_height,
(width - w_off) / img_width,
(height - h_off) / img_height)
expand_img = np.ones((height, width, 3))
expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean))
expand_img = Image.fromarray(expand_img)
expand_img.paste(img, (int(w_off), int(h_off)))
bbox_labels = transform_labels(bbox_labels, expand_bbox)
return expand_img, bbox_labels
return img, bbox_labels
...@@ -22,17 +22,38 @@ import os ...@@ -22,17 +22,38 @@ import os
class Settings(object): class Settings(object):
def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value): def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value,
apply_distort, apply_expand):
self._data_dir = data_dir self._data_dir = data_dir
self._label_list = [] self._label_list = []
label_fpath = os.path.join(data_dir, label_file) label_fpath = os.path.join(data_dir, label_file)
for line in open(label_fpath): for line in open(label_fpath):
self._label_list.append(line.strip()) self._label_list.append(line.strip())
self._apply_distort = apply_distort
self._apply_expand = apply_expand
self._resize_height = resize_h self._resize_height = resize_h
self._resize_width = resize_w self._resize_width = resize_w
self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype( self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
'float32') 'float32')
self._expand_prob = 0.5
self._expand_max_ratio = 4
self._hue_prob = 0.5
self._hue_delta = 18
self._contrast_prob = 0.5
self._contrast_delta = 0.5
self._saturation_prob = 0.5
self._saturation_delta = 0.5
self._brightness_prob = 0.5
self._brightness_delta = 0.125
@property
def apply_distort(self):
return self._apply_expand
@property
def apply_distort(self):
return self._apply_distort
@property @property
def data_dir(self): def data_dir(self):
...@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle):
img = Image.open(img_path) img = Image.open(img_path)
img_width, img_height = img.size img_width, img_height = img.size
img = np.array(img)
# layout: label | xmin | ymin | xmax | ymax | difficult # layout: label | xmin | ymin | xmax | ymax | difficult
if mode == 'train' or mode == 'test': if mode == 'train' or mode == 'test':
...@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle):
sample_labels = bbox_labels sample_labels = bbox_labels
if mode == 'train': if mode == 'train':
if settings._apply_distort:
img = image_util.distort_image(img, settings)
if settings._apply_expand:
img, bbox_labels = image_util.expand_image(
img, bbox_labels, img_width, img_height,
settings)
batch_sampler = [] batch_sampler = []
# hard-code here # hard-code here
batch_sampler.append( batch_sampler.append(
...@@ -126,13 +152,14 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -126,13 +152,14 @@ def _reader_creator(settings, file_list, mode, shuffle):
sampled_bbox = image_util.generate_batch_samples( sampled_bbox = image_util.generate_batch_samples(
batch_sampler, bbox_labels, img_width, img_height) batch_sampler, bbox_labels, img_width, img_height)
img = np.array(img)
if len(sampled_bbox) > 0: if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox))) idx = int(random.uniform(0, len(sampled_bbox)))
img, sample_labels = image_util.crop_image( img, sample_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width, img, bbox_labels, sampled_bbox[idx], img_width,
img_height) img_height)
img = Image.fromarray(img) img = Image.fromarray(img)
img = img.resize((settings.resize_w, settings.resize_h), img = img.resize((settings.resize_w, settings.resize_h),
Image.ANTIALIAS) Image.ANTIALIAS)
img = np.array(img) img = np.array(img)
......
...@@ -12,8 +12,9 @@ import functools ...@@ -12,8 +12,9 @@ import functools
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser) add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable # yapf: disable
add_arg('parallel', bool, True, "Whether use parallel training.") add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether use GPU.") add_arg('parallel', bool, True, "Whether use parallel training.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
# yapf: disable # yapf: disable
...@@ -47,26 +48,24 @@ def train(args, ...@@ -47,26 +48,24 @@ def train(args,
locs, confs, box, box_var = mobile_net(image_, image_shape) locs, confs, box, box_var = mobile_net(image_, image_shape)
loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
box, box_var) box, box_var)
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45)
loss = fluid.layers.reduce_sum(loss)
pd.write_output(loss) pd.write_output(loss)
pd.write_output(locs) pd.write_output(nmsed_out)
pd.write_output(confs)
pd.write_output(box)
pd.write_output(box_var)
loss, locs, confs, box, box_var = pd() loss, nmsed_out = pd()
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.mean(loss)
else: else:
locs, confs, box, box_var = mobile_net(image, image_shape) locs, confs, box, box_var = mobile_net(image, image_shape)
nmsed_out = fluid.layers.detection_output( nmsed_out = fluid.layers.detection_output(
locs, mbox_confs, box, box_var, nms_threshold=0.45) locs, confs, box, box_var, nms_threshold=0.45)
loss = fluid.layers.ssd_loss(locs, mbox_confs, gt_box, gt_label, loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label,
box, box_var) box, box_var)
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.reduce_sum(loss)
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(test_program): with fluid.program_guard(test_program):
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45)
map_eval = fluid.evaluator.DetectionMAP( map_eval = fluid.evaluator.DetectionMAP(
nmsed_out, nmsed_out,
gt_label, gt_label,
...@@ -77,13 +76,10 @@ def train(args, ...@@ -77,13 +76,10 @@ def train(args,
evaluate_difficult=False, evaluate_difficult=False,
ap_version='11point') ap_version='11point')
optimizer = fluid.optimizer.Momentum( boundaries = [40000, 60000]
learning_rate=fluid.layers.exponential_decay( values = [0.001, 0.0005, 0.00025]
learning_rate=learning_rate, optimizer = fluid.optimizer.RMSProp(
decay_steps=40000, learning_rate=fluid.layers.piecewise_decay(boundaries, values),
decay_rate=0.1,
staircase=True),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.00005), ) regularization=fluid.regularizer.L2Decay(0.00005), )
optimizer.minimize(loss) optimizer.minimize(loss)
...@@ -92,7 +88,8 @@ def train(args, ...@@ -92,7 +88,8 @@ def train(args,
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
load_model.load_paddlev1_vars(place) load_model.load_and_set_vars(place)
#load_model.load_paddlev1_vars(place)
train_reader = paddle.batch( train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size) reader.train(data_args, train_file_list), batch_size=batch_size)
test_reader = paddle.batch( test_reader = paddle.batch(
...@@ -100,7 +97,6 @@ def train(args, ...@@ -100,7 +97,6 @@ def train(args,
feeder = fluid.DataFeeder( feeder = fluid.DataFeeder(
place=place, feed_list=[image, gt_box, gt_label, difficult]) place=place, feed_list=[image, gt_box, gt_label, difficult])
#print 'test_program ', test_program
def test(pass_id): def test(pass_id):
_, accum_map = map_eval.get_map_var() _, accum_map = map_eval.get_map_var()
map_eval.reset(exe) map_eval.reset(exe)
...@@ -111,14 +107,14 @@ def train(args, ...@@ -111,14 +107,14 @@ def train(args,
fetch_list=[accum_map]) fetch_list=[accum_map])
print("Test {0}, map {1}".format(pass_id, test_map[0])) print("Test {0}, map {1}".format(pass_id, test_map[0]))
#print 'main_program ', fluid.default_main_program()
for pass_id in range(num_passes): for pass_id in range(num_passes):
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
loss_v = exe.run(fluid.default_main_program(), loss_v = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
print("Pass {0}, batch {1}, loss {2}" if batch_id % 20 == 0:
.format(pass_id, batch_id, loss_v[0])) print("Pass {0}, batch {1}, loss {2}"
.format(pass_id, batch_id, loss_v[0]))
test(pass_id) test(pass_id)
if pass_id % 10 == 0: if pass_id % 10 == 0:
...@@ -134,6 +130,8 @@ if __name__ == '__main__': ...@@ -134,6 +130,8 @@ if __name__ == '__main__':
data_args = reader.Settings( data_args = reader.Settings(
data_dir='./data', data_dir='./data',
label_file='label_list', label_file='label_list',
apply_distort=True,
apply_expand=True,
resize_h=300, resize_h=300,
resize_w=300, resize_w=300,
mean_value=[127.5, 127.5, 127.5]) mean_value=[127.5, 127.5, 127.5])
...@@ -142,5 +140,5 @@ if __name__ == '__main__': ...@@ -142,5 +140,5 @@ if __name__ == '__main__':
val_file_list='./data/test.txt', val_file_list='./data/test.txt',
data_args=data_args, data_args=data_args,
learning_rate=0.001, learning_rate=0.001,
batch_size=32, batch_size=args.batch_size,
num_passes=300) num_passes=300)
...@@ -187,15 +187,18 @@ def ctc_train_net(images, label, args, num_classes): ...@@ -187,15 +187,18 @@ def ctc_train_net(images, label, args, num_classes):
error_evaluator = fluid.evaluator.EditDistance( error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label) input=decoded_out, label=casted_label)
inference_program = fluid.default_main_program().clone() inference_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(error_evaluator)
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=args.learning_rate, momentum=args.momentum) learning_rate=args.learning_rate, momentum=args.momentum)
_, params_grads = optimizer.minimize(sum_cost) _, params_grads = optimizer.minimize(sum_cost)
model_average = fluid.optimizer.ModelAverage(
params_grads,
args.average_window,
min_average_window=args.min_average_window,
max_average_window=args.max_average_window)
return sum_cost, error_evaluator, inference_program return sum_cost, error_evaluator, inference_program, model_average
def ctc_infer(images, num_classes): def ctc_infer(images, num_classes):
......
import os import os
import cv2 import cv2
import tarfile
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from os import path
from paddle.v2.image import load_image from paddle.v2.image import load_image
import paddle.v2 as paddle import paddle.v2 as paddle
NUM_CLASSES = 10784 NUM_CLASSES = 10784
DATA_SHAPE = [1, 48, 512] DATA_SHAPE = [1, 48, 512]
DATA_MD5 = "1de60d54d19632022144e4e58c2637b5"
DATA_URL = "http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c"
CACHE_DIR_NAME = "ctc_data"
SAVED_FILE_NAME = "data.tar.gz"
DATA_DIR_NAME = "data"
TRAIN_DATA_DIR_NAME = "train_images"
TEST_DATA_DIR_NAME = "test_images"
TRAIN_LIST_FILE_NAME = "train.list"
TEST_LIST_FILE_NAME = "test.list"
class DataGenerator(object): class DataGenerator(object):
def __init__(self): def __init__(self):
...@@ -102,25 +113,42 @@ class DataGenerator(object): ...@@ -102,25 +113,42 @@ class DataGenerator(object):
def num_classes(): def num_classes():
'''Get classes number of this dataset.
'''
return NUM_CLASSES return NUM_CLASSES
def data_shape(): def data_shape():
'''Get image shape of this dataset. It is a dummy shape for this dataset.
'''
return DATA_SHAPE return DATA_SHAPE
def train(batch_size): def train(batch_size):
generator = DataGenerator() generator = DataGenerator()
data_dir = download_data()
return generator.train_reader( return generator.train_reader(
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train_images/", path.join(data_dir, TRAIN_DATA_DIR_NAME),
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train.list", path.join(data_dir, TRAIN_LIST_FILE_NAME), batch_size)
batch_size)
def test(batch_size=1): def test(batch_size=1):
generator = DataGenerator() generator = DataGenerator()
data_dir = download_data()
return paddle.batch( return paddle.batch(
generator.test_reader( generator.test_reader(
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test_images/", path.join(data_dir, TRAIN_DATA_DIR_NAME),
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test.list" path.join(data_dir, TRAIN_LIST_FILE_NAME)), batch_size)
), batch_size)
def download_data():
'''Download train and test data.
'''
tar_file = paddle.dataset.common.download(
DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
if not path.isdir(data_dir):
t = tarfile.open(tar_file, "r:gz")
t.extractall(path=path.dirname(tar_file))
t.close()
return data_dir
...@@ -8,6 +8,7 @@ import functools ...@@ -8,6 +8,7 @@ import functools
import sys import sys
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_train_net from crnn_ctc_model import ctc_train_net
import time
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser) add_arg = functools.partial(add_arguments, argparser=parser)
...@@ -23,7 +24,10 @@ add_arg('momentum', float, 0.9, "Momentum.") ...@@ -23,7 +24,10 @@ add_arg('momentum', float, 0.9, "Momentum.")
add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.") add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
add_arg('device', int, 0, "Device id.'-1' means running on CPU" add_arg('device', int, 0, "Device id.'-1' means running on CPU"
"while '0' means GPU-0.") "while '0' means GPU-0.")
add_arg('parallel', bool, True, "Whether use parallel training.") add_arg('min_average_window', int, 10000, "Min average window.")
add_arg('max_average_window', int, 15625, "Max average window.")
add_arg('average_window', float, 0.15, "Average window.")
add_arg('parallel', bool, False, "Whether use parallel training.")
# yapf: disable # yapf: disable
def load_parameter(place): def load_parameter(place):
...@@ -40,7 +44,7 @@ def train(args, data_reader=dummy_reader): ...@@ -40,7 +44,7 @@ def train(args, data_reader=dummy_reader):
# define network # define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1) label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
sum_cost, error_evaluator, inference_program = ctc_train_net(images, label, args, num_classes) sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(images, label, args, num_classes)
# data reader # data reader
train_reader = data_reader.train(args.batch_size) train_reader = data_reader.train(args.batch_size)
...@@ -66,21 +70,24 @@ def train(args, data_reader=dummy_reader): ...@@ -66,21 +70,24 @@ def train(args, data_reader=dummy_reader):
fetch_list=[sum_cost] + error_evaluator.metrics) fetch_list=[sum_cost] + error_evaluator.metrics)
total_loss += batch_loss[0] total_loss += batch_loss[0]
total_seq_error += batch_seq_error[0] total_seq_error += batch_seq_error[0]
if batch_id % 10 == 1: if batch_id % 100 == 1:
print '.', print '.',
sys.stdout.flush() sys.stdout.flush()
if batch_id % args.log_period == 1: if batch_id % args.log_period == 1:
print "\nPass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % ( print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
time.time(),
pass_id, batch_id, total_loss / (batch_id * args.batch_size), total_seq_error / (batch_id * args.batch_size)) pass_id, batch_id, total_loss / (batch_id * args.batch_size), total_seq_error / (batch_id * args.batch_size))
sys.stdout.flush() sys.stdout.flush()
batch_id += 1 batch_id += 1
error_evaluator.reset(exe) with model_average.apply(exe):
for data in test_reader(): error_evaluator.reset(exe)
exe.run(inference_program, feed=get_feeder_data(data, place)) for data in test_reader():
_, test_seq_error = error_evaluator.eval(exe) exe.run(inference_program, feed=get_feeder_data(data, place))
print "\nEnd pass[%d]; Test seq error: %s.\n" % ( _, test_seq_error = error_evaluator.eval(exe)
pass_id, str(test_seq_error[0]))
print "\nEnd pass[%d]; Test seq error: %s.\n" % (
pass_id, str(test_seq_error[0]))
def main(): def main():
args = parser.parse_args() args = parser.parse_args()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册