diff --git a/fluid/DeepASR/data_utils/async_data_reader.py b/fluid/DeepASR/data_utils/async_data_reader.py
index 1515b299d4357eac16892dceda6e4f05bf1fc045..731c55de71e8d4b7db156f1ae72172c36eb1be7a 100644
--- a/fluid/DeepASR/data_utils/async_data_reader.py
+++ b/fluid/DeepASR/data_utils/async_data_reader.py
@@ -15,9 +15,7 @@ from multiprocessing import Manager, Process
 import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 from data_utils.util import suppress_complaints, suppress_signal
-from data_utils.util import SharedNDArray, SharedMemoryPoolManager
-from data_utils.util import DaemonProcessGroup, batch_to_ndarray
-from data_utils.util import CriticalException, ForceExitWrapper, EpochEndSignal
+from data_utils.util import CriticalException, ForceExitWrapper
 
 
 class SampleInfo(object):
@@ -32,11 +30,12 @@ class SampleInfo(object):
         label_bin_path (str): File containing the label data.
         label_size (int): Byte count of the sample's label data.
         label_frame_num (int): Label number of the sample.
+        sample_name (str): Key of the sample
     """
 
     def __init__(self, feature_bin_path, feature_start, feature_size,
                  feature_frame_num, feature_dim, label_bin_path, label_start,
-                 label_size, label_frame_num):
+                 label_size, label_frame_num, sample_name):
         self.feature_bin_path = feature_bin_path
         self.feature_start = feature_start
         self.feature_size = feature_size
@@ -47,6 +46,7 @@ class SampleInfo(object):
         self.label_start = label_start
         self.label_size = label_size
         self.label_frame_num = label_frame_num
+        self.sample_name = sample_name
 
 
 class SampleInfoBucket(object):
@@ -69,8 +69,8 @@ class SampleInfoBucket(object):
         split_sentence_threshold(int): Sentence whose length larger than
             the value will trigger split operation.
         split_sub_sentence_len(int): sub-sentence length is equal to
-            (split_sub_sentence_len + \
-                    rand() % split_perturb).
+            (split_sub_sentence_len +
+             rand() % split_perturb).
""" def __init__(self, @@ -104,24 +104,33 @@ class SampleInfoBucket(object): feature_bin_path = self._feature_bin_paths[block_idx] feature_desc_path = self._feature_desc_paths[block_idx] - label_desc_lines = open(label_desc_path).readlines() feature_desc_lines = open(feature_desc_path).readlines() - sample_num = int(label_desc_lines[0].split()[1]) - assert sample_num == int(feature_desc_lines[0].split()[1]) + label_desc_lines = [] + if label_desc_path != "": + label_desc_lines = open(label_desc_path).readlines() + sample_num = int(feature_desc_lines[0].split()[1]) + + if label_desc_path != "": + assert sample_num == int(label_desc_lines[0].split()[1]) for i in xrange(sample_num): feature_desc_split = feature_desc_lines[i + 1].split() + sample_name = feature_desc_split[0] feature_start = int(feature_desc_split[2]) feature_size = int(feature_desc_split[3]) feature_frame_num = int(feature_desc_split[4]) feature_dim = int(feature_desc_split[5]) - label_desc_split = label_desc_lines[i + 1].split() - label_start = int(label_desc_split[2]) - label_size = int(label_desc_split[3]) - label_frame_num = int(label_desc_split[4]) - assert feature_frame_num == label_frame_num + label_start = -1 + label_size = -1 + label_frame_num = feature_frame_num + if label_desc_path != "": + label_desc_split = label_desc_lines[i + 1].split() + label_start = int(label_desc_split[2]) + label_size = int(label_desc_split[3]) + label_frame_num = int(label_desc_split[4]) + assert feature_frame_num == label_frame_num if self._split_sentence_threshold == -1 or \ self._split_perturb == -1 or \ @@ -131,7 +140,7 @@ class SampleInfoBucket(object): SampleInfo(feature_bin_path, feature_start, feature_size, feature_frame_num, feature_dim, label_bin_path, label_start, label_size, - label_frame_num)) + label_frame_num, sample_name)) #split sentence else: cur_frame_pos = 0 @@ -152,16 +161,19 @@ class SampleInfoBucket(object): * feature_dim * 4, cur_frame_len * feature_dim * 4, cur_frame_len, feature_dim, label_bin_path, label_start + cur_frame_pos * 4, cur_frame_len * - 4, cur_frame_len)) + 4, cur_frame_len, sample_name)) remain_frame_num -= cur_frame_len cur_frame_pos += cur_frame_len if remain_frame_num <= 0: break - return sample_info_list +class EpochEndSignal(): + pass + + class AsyncDataReader(object): """DataReader provides basic audio sample preprocessing pipeline including data loading and data augmentation. 
@@ -190,7 +202,7 @@ class AsyncDataReader(object):
 
     def __init__(self,
                  feature_file_list,
-                 label_file_list,
+                 label_file_list="",
                  drop_frame_len=512,
                  proc_num=10,
                  sample_buffer_size=1024,
@@ -213,27 +225,30 @@ class AsyncDataReader(object):
         self._sample_info_buffer_size = sample_info_buffer_size
         self._batch_buffer_size = batch_buffer_size
         self._proc_num = proc_num
-        if self._proc_num <= 2:
-            raise ValueError("Value of `proc_num` should be greater than 2.")
-        self._sample_proc_num = self._proc_num - 2
         self._verbose = verbose
         self._force_exit = ForceExitWrapper(self._manager.Value('b', False))
-        self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size *
-                                                     3, self._manager)
 
     def generate_bucket_list(self, is_shuffle):
         if self._block_info_list is None:
             block_feature_info_lines = open(self._feature_file_list).readlines()
-            block_label_info_lines = open(self._label_file_list).readlines()
-            assert len(block_feature_info_lines) == len(block_label_info_lines)
             self._block_info_list = []
-            for i in xrange(0, len(block_feature_info_lines), 2):
-                block_info = (block_feature_info_lines[i],
-                              block_feature_info_lines[i + 1],
-                              block_label_info_lines[i],
-                              block_label_info_lines[i + 1])
-                self._block_info_list.append(
-                    map(lambda line: line.strip(), block_info))
+            if self._label_file_list != "":
+                block_label_info_lines = open(self._label_file_list).readlines()
+                assert len(block_feature_info_lines) == len(
+                    block_label_info_lines)
+                for i in xrange(0, len(block_feature_info_lines), 2):
+                    block_info = (block_feature_info_lines[i],
+                                  block_feature_info_lines[i + 1],
+                                  block_label_info_lines[i],
+                                  block_label_info_lines[i + 1])
+                    self._block_info_list.append(
+                        map(lambda line: line.strip(), block_info))
+            else:
+                for i in xrange(0, len(block_feature_info_lines), 2):
+                    block_info = (block_feature_info_lines[i],
+                                  block_feature_info_lines[i + 1], "", "")
+                    self._block_info_list.append(
+                        map(lambda line: line.strip(), block_info))
 
         if is_shuffle:
             self._rng.shuffle(self._block_info_list)
@@ -253,23 +268,13 @@ class AsyncDataReader(object):
     def set_transformers(self, transformers):
         self._transformers = transformers
 
-    def recycle(self, *args):
-        for shared_ndarray in args:
-            if not isinstance(shared_ndarray, SharedNDArray):
-                raise Value("Only support recycle SharedNDArray object.")
-            shared_ndarray.recycle(self._pool_manager.pool)
-
-    def _start_async_processing(self):
+    def _sample_generator(self):
         sample_info_queue = self._manager.Queue(self._sample_info_buffer_size)
         sample_queue = self._manager.Queue(self._sample_buffer_size)
         self._order_id = 0
 
         @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
         def ordered_feeding_task(sample_info_queue):
-            if self._verbose == 0:
-                signal.signal(signal.SIGTERM, suppress_signal)
-                signal.signal(signal.SIGINT, suppress_signal)
-
             for sample_info_bucket in self._bucket_list:
                 try:
                     sample_info_list = \
@@ -282,12 +287,13 @@ class AsyncDataReader(object):
                     sample_info_queue.put((sample_info, self._order_id))
                     self._order_id += 1
 
-            for i in xrange(self._sample_proc_num):
+            for i in xrange(self._proc_num):
                 sample_info_queue.put(EpochEndSignal())
 
-        feeding_proc = DaemonProcessGroup(
-            proc_num=1, target=ordered_feeding_task, args=(sample_info_queue, ))
-        feeding_proc.start_all()
+        feeding_thread = Thread(
+            target=ordered_feeding_task, args=(sample_info_queue, ))
+        feeding_thread.daemon = True
+        feeding_thread.start()
 
         @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
         def ordered_processing_task(sample_info_queue, sample_queue,
                                     out_order):
@@ -315,25 +321,32 @@ class AsyncDataReader(object):
                                           sample_info.feature_size)
 
                 assert sample_info.feature_frame_num \
-                    * sample_info.feature_dim * 4 == len(feature_bytes), \
-                    (sample_info.feature_bin_path,
-                     sample_info.feature_frame_num,
-                     sample_info.feature_dim,
-                     len(feature_bytes))
-
-                label_bytes = read_bytes(sample_info.label_bin_path,
-                                         sample_info.label_start,
-                                         sample_info.label_size)
-
-                assert sample_info.label_frame_num * 4 == len(label_bytes), (
-                    sample_info.label_bin_path, sample_info.label_array,
-                    len(label_bytes))
-
-                label_array = struct.unpack('I' * sample_info.label_frame_num,
-                                            label_bytes)
-                label_data = np.array(
-                    label_array, dtype='int64').reshape(
-                        (sample_info.label_frame_num, 1))
+                    * sample_info.feature_dim * 4 \
+                    == len(feature_bytes), \
+                    (sample_info.feature_bin_path,
+                     sample_info.feature_frame_num,
+                     sample_info.feature_dim,
+                     len(feature_bytes))
+
+                label_data = None
+                if sample_info.label_bin_path != "":
+                    label_bytes = read_bytes(sample_info.label_bin_path,
+                                             sample_info.label_start,
+                                             sample_info.label_size)
+
+                    assert sample_info.label_frame_num * 4 == len(
+                        label_bytes), (sample_info.label_bin_path,
+                                       sample_info.label_frame_num,
+                                       len(label_bytes))
+
+                    label_array = struct.unpack(
+                        'I' * sample_info.label_frame_num, label_bytes)
+                    label_data = np.array(
+                        label_array, dtype='int64').reshape(
+                            (sample_info.label_frame_num, 1))
+                else:
+                    label_data = np.zeros(
+                        (sample_info.label_frame_num, 1), dtype='int64')
 
                 feature_frame_num = sample_info.feature_frame_num
                 feature_dim = sample_info.feature_dim
@@ -343,12 +356,11 @@ class AsyncDataReader(object):
                 feature_data = np.array(
                     feature_array, dtype='float32').reshape((
                         sample_info.feature_frame_num, sample_info.feature_dim))
-
-                sample_data = (feature_data, label_data)
+                sample_data = (feature_data, label_data,
+                               sample_info.sample_name)
                 for transformer in self._transformers:
                     # @TODO(pkuyym) to make transformer only accept feature_data
                     sample_data = transformer.perform_trans(sample_data)
-
                 while order_id != out_order[0]:
                     time.sleep(0.001)
@@ -364,71 +376,77 @@ class AsyncDataReader(object):
 
         out_order = self._manager.list([0])
         args = (sample_info_queue, sample_queue, out_order)
-        sample_proc = DaemonProcessGroup(
-            proc_num=self._sample_proc_num,
-            target=ordered_processing_task,
-            args=args)
-        sample_proc.start_all()
+        workers = [
+            Process(
+                target=ordered_processing_task, args=args)
+            for _ in xrange(self._proc_num)
+        ]
 
-        return sample_queue
+        for w in workers:
+            w.daemon = True
+            w.start()
 
-    def batch_iterator(self, batch_size, minimum_batch_size):
-        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
-        def batch_assembling_task(sample_queue, batch_queue, pool):
-            def conv_to_shared(ndarray):
-                while self._force_exit == False:
-                    try:
-                        (name, shared_ndarray) = pool.popitem()
-                    except Exception as e:
-                        time.sleep(0.001)
+        finished_proc_num = 0
+
+        while self._force_exit == False:
+            try:
+                sample = sample_queue.get_nowait()
+            except Queue.Empty:
+                time.sleep(0.001)
+            else:
+                if isinstance(sample, EpochEndSignal):
+                    finished_proc_num += 1
+                    if finished_proc_num >= self._proc_num:
+                        break
                 else:
-                    shared_ndarray.copy(ndarray)
-                    return shared_ndarray
+                    continue
 
-            if self._verbose == 0:
-                signal.signal(signal.SIGTERM, suppress_signal)
-                signal.signal(signal.SIGINT, suppress_signal)
+                yield sample
 
+    def batch_iterator(self, batch_size, minimum_batch_size):
+        def batch_to_ndarray(batch_samples, lod):
+            assert len(batch_samples)
+            frame_dim = batch_samples[0][0].shape[1]
+            batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
+            batch_label = np.zeros((lod[-1], 1), dtype="int64")
+            start = 0
+            name_lst = []
+            for sample in batch_samples:
+                frame_num = sample[0].shape[0]
+                batch_feature[start:start + frame_num, :] = sample[0]
+                batch_label[start:start + frame_num, :] = sample[1]
+                start += frame_num
+                name_lst.append(sample[2])
+            return (batch_feature, batch_label, name_lst)
+
+        @suppress_complaints(verbose=self._verbose, notify=self._force_exit)
+        def batch_assembling_task(sample_generator, batch_queue):
             batch_samples = []
             lod = [0]
-            done_num = 0
-            while done_num < self._sample_proc_num:
-                sample = sample_queue.get()
-                if isinstance(sample, EpochEndSignal):
-                    done_num += 1
-                else:
-                    batch_samples.append(sample)
-                    lod.append(lod[-1] + sample[0].shape[0])
-                    if len(batch_samples) == batch_size:
-                        feature, label = batch_to_ndarray(batch_samples, lod)
-
-                        feature = conv_to_shared(feature)
-                        label = conv_to_shared(label)
-                        lod = conv_to_shared(np.array(lod).astype('int64'))
-
-                        batch_queue.put((feature, label, lod))
-                        batch_samples = []
-                        lod = [0]
+            for sample in sample_generator():
+                batch_samples.append(sample)
+                lod.append(lod[-1] + sample[0].shape[0])
+                if len(batch_samples) == batch_size:
+                    (batch_feature, batch_label, name_lst) = batch_to_ndarray(
+                        batch_samples, lod)
+                    batch_queue.put((batch_feature, batch_label, lod, name_lst))
+                    batch_samples = []
+                    lod = [0]
 
             if len(batch_samples) >= minimum_batch_size:
-                (feature, label) = batch_to_ndarray(batch_samples, lod)
-
-                feature = conv_to_shared(feature)
-                label = conv_to_shared(label)
-                lod = conv_to_shared(np.array(lod).astype('int64'))
-
-                batch_queue.put((feature, label, lod))
+                (batch_feature, batch_label, name_lst) = batch_to_ndarray(
+                    batch_samples, lod)
+                batch_queue.put((batch_feature, batch_label, lod, name_lst))
 
             batch_queue.put(EpochEndSignal())
 
-        sample_queue = self._start_async_processing()
-        batch_queue = self._manager.Queue(self._batch_buffer_size)
+        batch_queue = Queue.Queue(self._batch_buffer_size)
 
-        assembling_proc = DaemonProcessGroup(
-            proc_num=1,
+        assembling_thread = Thread(
             target=batch_assembling_task,
-            args=(sample_queue, batch_queue, self._pool_manager.pool))
-        assembling_proc.start_all()
+            args=(self._sample_generator, batch_queue))
+        assembling_thread.daemon = True
+        assembling_thread.start()
 
         while self._force_exit == False:
             try:
diff --git a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
index 157ab02eee0093fe5d683e642b3d18d842cb4e19..9f76a9f8590d5f148398c4ffaff77dc95421df83 100644
--- a/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
+++ b/fluid/DeepASR/data_utils/augmentor/tests/test_data_trans.py
@@ -22,7 +22,7 @@ class TestTransMeanVarianceNorm(unittest.TestCase):
         feature = np.zeros((2, 120), dtype="float32")
         feature.fill(1)
         trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path)
-        (feature1, label1) = trans.perform_trans((feature, None))
+        (feature1, label1, name) = trans.perform_trans((feature, None, None))
         (mean, var) = trans.get_mean_var()
         feature_flat1 = feature1.flatten()
         feature_flat = feature.flatten()
@@ -70,7 +70,7 @@ class TestTransAddDelta(unittest.TestCase):
         feature[2, 0:40].fill(3)
         feature[3, 0:40].fill(4)
         trans = trans_add_delta.TransAddDelta()
-        (feature, label) = trans.perform_trans((feature, None))
+        (feature, label, name) = trans.perform_trans((feature, None, None))
         self.assertAlmostEqual(feature.shape[0], 4)
         self.assertAlmostEqual(feature.shape[1], 120)
         self.assertAlmostEqual(1.0, feature[0][0])
@@ -93,7 +93,7 @@ class TestTransSplict(unittest.TestCase):
             feature[i, :].fill(i)
 
         trans = trans_splice.TransSplice()
-        (feature, label) = trans.perform_trans((feature, None))
+        (feature, label, name) = trans.perform_trans((feature, None, None))
         self.assertEqual(feature.shape[1], 110)
 
         for i in xrange(8):
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
index dc1a4fa45be38152eba773c35e67d0ad3e4a13cb..aa8062f87c932b76dd8a79db825d07e8be273857 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_add_delta.py
@@ -32,9 +32,9 @@ class TransAddDelta(object):
         Args:
             sample(object,tuple): contain feature numpy and label numpy
         Returns:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         frame_dim = feature.shape[1]
         d_frame_dim = frame_dim * 3
         head_filled = 5
@@ -64,7 +64,7 @@ class TransAddDelta(object):
                          start * d_frame_dim + 2 * frame_dim, frame_dim,
                          nframe, d_frame_dim)
         mat.shape = tmp_shape
-        return (mat[head_filled:mat.shape[0] - tail_filled, :], label)
+        return (mat[head_filled:mat.shape[0] - tail_filled, :], label, name)
 
     def _regress(self, data_in, start_in, data_out, start_out, size, n, step):
         """ regress
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
index 5b541d426c61364639f7a9d9f50bd51a2c06efa5..9f91b726ea2bcd432340cd06a3cb9006cd5f83f4 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_mean_variance_norm.py
@@ -53,9 +53,9 @@ class TransMeanVarianceNorm(object):
         Args:
             sample(object):input sample, contain feature numpy and label numpy
         Returns:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         shape = feature.shape
         assert len(shape) == 2
         nfeature_len = shape[0] * shape[1]
@@ -68,4 +68,4 @@ class TransMeanVarianceNorm(object):
             feature[ncur_idx:ncur_idx + self._nLen] = block
             ncur_idx += self._nLen
         feature = feature.reshape(shape)
-        return (feature, label)
+        return (feature, label, name)
diff --git a/fluid/DeepASR/data_utils/augmentor/trans_splice.py b/fluid/DeepASR/data_utils/augmentor/trans_splice.py
index 94f5258de316045d41999b26c6963f8487e9c55a..1fab3d6b442c1613f18d16fd0b0ee89464dbeb2c 100644
--- a/fluid/DeepASR/data_utils/augmentor/trans_splice.py
+++ b/fluid/DeepASR/data_utils/augmentor/trans_splice.py
@@ -30,9 +30,9 @@ class TransSplice(object):
         Args:
             sample(object): input sample(feature, label)
         Return:
-            (feature, label)
+            (feature, label, name)
         """
-        (feature, label) = sample
+        (feature, label, name) = sample
         nframe_num = feature.shape[0]
         nframe_dim = feature.shape[1]
         nnew_frame_dim = nframe_dim * (
@@ -61,4 +61,4 @@ class TransSplice(object):
             np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim],
                       mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim])
         ret = ret.reshape((nframe_num, nnew_frame_dim))
-        return (ret, label)
+        return (ret, label, name)
diff --git a/fluid/DeepASR/data_utils/util.py b/fluid/DeepASR/data_utils/util.py
index e8ccbadc0bf2106ccabd73a449eb5e53983ccf95..0a48f4696547377dbe89934355e8eaac38966fab 100644
--- a/fluid/DeepASR/data_utils/util.py
+++ b/fluid/DeepASR/data_utils/util.py
@@ -4,8 +4,6 @@ from __future__ import print_function
 import sys
 from six import reraise
 from tblib import Traceback
-from multiprocessing import Manager, Process
-import posix_ipc, mmap
 
 import numpy as np
 
@@ -37,19 +35,6 @@ def lodtensor_to_ndarray(lod_tensor):
     return ret, lod_tensor.lod()
 
 
-def batch_to_ndarray(batch_samples, lod):
-    frame_dim = batch_samples[0][0].shape[1]
-    batch_feature = np.zeros((lod[-1], frame_dim), dtype="float32")
-    batch_label = np.zeros((lod[-1], 1), dtype="int64")
-    start = 0
-    for sample in batch_samples:
-        frame_num = sample[0].shape[0]
-        batch_feature[start:start + frame_num, :] = sample[0]
-        batch_label[start:start + frame_num, :] = sample[1]
-        start += frame_num
-    return (batch_feature, batch_label)
-
-
 def split_infer_result(infer_seq, lod):
     infer_batch = []
     for i in xrange(0, len(lod[0]) - 1):
@@ -57,126 +42,10 @@ def split_infer_result(infer_seq, lod):
     return infer_batch
 
 
-class DaemonProcessGroup(object):
-    def __init__(self, proc_num, target, args):
-        self._proc_num = proc_num
-        self._workers = [
-            Process(
-                target=target, args=args) for _ in xrange(self._proc_num)
-        ]
-
-    def start_all(self):
-        for w in self._workers:
-            w.daemon = True
-            w.start()
-
-    @property
-    def proc_num(self):
-        return self._proc_num
-
-
-class EpochEndSignal(object):
-    pass
-
-
 class CriticalException(Exception):
     pass
 
 
-class SharedNDArray(object):
-    """SharedNDArray utilizes shared memory to avoid data serialization when
-    data object shared among different processes. We can reconstruct the
-    `ndarray` when memory address, shape and dtype provided.
-
-    Args:
-        name (str): Address name of shared memory.
-        whether_verify (bool): Whether to validate the writing operation.
-    """
-
-    def __init__(self, name, whether_verify=False):
-        self._name = name
-        self._shm = None
-        self._buf = None
-        self._array = np.zeros(1, dtype=np.float32)
-        self._inited = False
-        self._whether_verify = whether_verify
-
-    def zeros_like(self, shape, dtype):
-        size = int(np.prod(shape)) * np.dtype(dtype).itemsize
-        if self._inited:
-            self._shm = posix_ipc.SharedMemory(self._name)
-        else:
-            self._shm = posix_ipc.SharedMemory(
-                self._name, posix_ipc.O_CREAT, size=size)
-        self._buf = mmap.mmap(self._shm.fd, size)
-        self._array = np.ndarray(shape, dtype, self._buf, order='C')
-
-    def copy(self, ndarray):
-        size = int(np.prod(ndarray.shape)) * np.dtype(ndarray.dtype).itemsize
-        self.zeros_like(ndarray.shape, ndarray.dtype)
-        self._array[:] = ndarray
-        self._buf.flush()
-        self._inited = True
-
-        if self._whether_verify:
-            shm = posix_ipc.SharedMemory(self._name)
-            buf = mmap.mmap(shm.fd, size)
-            array = np.ndarray(ndarray.shape, ndarray.dtype, buf, order='C')
-            np.testing.assert_array_equal(array, ndarray)
-
-    @property
-    def ndarray(self):
-        return self._array
-
-    def recycle(self, pool):
-        self._buf.close()
-        self._shm.close_fd()
-        self._inited = False
-        pool[self._name] = self
-
-    def __getstate__(self):
-        return (self._name, self._array.shape, self._array.dtype, self._inited,
-                self._whether_verify)
-
-    def __setstate__(self, state):
-        self._name = state[0]
-        self._inited = state[3]
-        self.zeros_like(state[1], state[2])
-        self._whether_verify = state[4]
-
-
-class SharedMemoryPoolManager(object):
-    """SharedMemoryPoolManager maintains a multiprocessing.Manager.dict object.
-    All available addresses are allocated once and will be reused. Though this
-    class is not process-safe, the pool can be shared between processes. All
-    shared memory should be unlinked before the main process exited.
-
-    Args:
-        pool_size (int): Size of shared memory pool.
-        manager (dict): A multiprocessing.Manager object, the pool is
-            maintained by the proxy process.
-        name_prefix (str): Address prefix of shared memory.
-    """
-
-    def __init__(self, pool_size, manager, name_prefix='/deep_asr'):
-        self._names = []
-        self._dict = manager.dict()
-
-        for i in xrange(pool_size):
-            name = name_prefix + '_' + str(i)
-            self._dict[name] = SharedNDArray(name)
-            self._names.append(name)
-
-    @property
-    def pool(self):
-        return self._dict
-
-    def __del__(self):
-        for name in self._names:
-            # have to unlink the shared memory
-            posix_ipc.unlink_shared_memory(name)
-
-
 def suppress_signal(signo, stack_frame):
     pass
 
diff --git a/fluid/DeepASR/decoder/decoder.cc b/fluid/DeepASR/decoder/decoder.cc
deleted file mode 100644
index a99f972e2fc2341247eb2a6aa564d8d6b5e2905d..0000000000000000000000000000000000000000
--- a/fluid/DeepASR/decoder/decoder.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "decoder.h"
-
-std::string decode(std::vector<std::vector<float>> probs_mat) {
-  // Add decoding logic here
-
-  return "example decoding result";
-}
diff --git a/fluid/DeepASR/decoder/decoder.h b/fluid/DeepASR/decoder/decoder.h
deleted file mode 100644
index 4a67fa366ae31dd393d0e3b2f04d27e360596fe5..0000000000000000000000000000000000000000
--- a/fluid/DeepASR/decoder/decoder.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-#include <vector>
-
-std::string decode(std::vector<std::vector<float>> probs_mat);
diff --git a/fluid/DeepASR/decoder/post_decode_faster.cc b/fluid/DeepASR/decoder/post_decode_faster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce2b45bc6cecec5466f3d20841e5b8ba38151a6c
--- /dev/null
+++ b/fluid/DeepASR/decoder/post_decode_faster.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "post_decode_faster.h"
+
+typedef kaldi::int32 int32;
+using fst::SymbolTable;
+using fst::VectorFst;
+using fst::StdArc;
+
+Decoder::Decoder(std::string word_syms_filename,
+                 std::string fst_in_filename,
+                 std::string logprior_rxfilename,
+                 kaldi::BaseFloat acoustic_scale) {
+  const char* usage =
+      "Decode, reading log-likelihoods (of transition-ids or whatever symbol "
+      "is on the graph) as matrices.";
+
+  kaldi::ParseOptions po(usage);
+  binary = true;
+  this->acoustic_scale = acoustic_scale;
+  allow_partial = true;
+  kaldi::FasterDecoderOptions decoder_opts;
+  decoder_opts.Register(&po, true);  // true == include obscure settings.
+  po.Register("binary", &binary, "Write output in binary mode");
+  po.Register("allow-partial",
+              &allow_partial,
+              "Produce output even when final state was not reached");
+  po.Register("acoustic-scale",
+              &acoustic_scale,
+              "Scaling factor for acoustic likelihoods");
+
+  word_syms = NULL;
+  if (word_syms_filename != "") {
+    word_syms = fst::SymbolTable::ReadText(word_syms_filename);
+    if (!word_syms)
+      KALDI_ERR << "Could not read symbol table from file "
+                << word_syms_filename;
+  }
+
+  std::ifstream is_logprior(logprior_rxfilename);
+  logprior.Read(is_logprior, false);
+
+  // It's important that we initialize decode_fst after loglikes_reader, as it
+  // can prevent crashes on systems installed without enough virtual memory.
+  // It has to do with what happens on UNIX systems if you call fork() on a
+  // large process: the page-table entries are duplicated, which requires a
+  // lot of virtual memory.
+  decode_fst = fst::ReadFstKaldi(fst_in_filename);
+
+  decoder = new kaldi::FasterDecoder(*decode_fst, decoder_opts);
+}
+
+
+Decoder::~Decoder() {
+  if (word_syms) delete word_syms;
+  delete decode_fst;
+  delete decoder;
+}
+
+std::string Decoder::decode(
+    std::string key,
+    const std::vector<std::vector<kaldi::BaseFloat>>& log_probs) {
+  size_t num_frames = log_probs.size();
+  size_t dim_label = log_probs[0].size();
+
+  kaldi::Matrix<kaldi::BaseFloat> loglikes(
+      num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols);
+  for (size_t i = 0; i < num_frames; ++i) {
+    memcpy(loglikes.Data() + i * dim_label,
+           log_probs[i].data(),
+           sizeof(kaldi::BaseFloat) * dim_label);
+  }
+
+  return decode(key, loglikes);
+}
+
+
+std::vector<std::string> Decoder::decode(std::string posterior_rspecifier) {
+  kaldi::SequentialBaseFloatMatrixReader posterior_reader(posterior_rspecifier);
+  std::vector<std::string> decoding_results;
+
+  for (; !posterior_reader.Done(); posterior_reader.Next()) {
+    std::string key = posterior_reader.Key();
+    kaldi::Matrix<kaldi::BaseFloat> loglikes(posterior_reader.Value());
+
+    decoding_results.push_back(decode(key, loglikes));
+  }
+
+  return decoding_results;
+}
+
+
+std::string Decoder::decode(std::string key,
+                            kaldi::Matrix<kaldi::BaseFloat>& loglikes) {
+  std::string decoding_result;
+
+  if (loglikes.NumRows() == 0) {
+    KALDI_WARN << "Zero-length utterance: " << key;
+  }
+  KALDI_ASSERT(loglikes.NumCols() == logprior.Dim());
+
+  loglikes.ApplyLog();
+  loglikes.AddVecToRows(-1.0, logprior);
+
+  kaldi::DecodableMatrixScaled decodable(loglikes, acoustic_scale);
+  decoder->Decode(&decodable);
+
+  VectorFst<kaldi::LatticeArc> decoded;  // linear FST.
+
+  if ((allow_partial || decoder->ReachedFinal()) &&
+      decoder->GetBestPath(&decoded)) {
+    if (!decoder->ReachedFinal())
+      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
+                    "traceback.";
+
+    std::vector<int32> alignment;
+    std::vector<int32> words;
+    kaldi::LatticeWeight weight;
+
+    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
+
+    if (word_syms != NULL) {
+      for (size_t i = 0; i < words.size(); i++) {
+        std::string s = word_syms->Find(words[i]);
+        decoding_result += s;
+        if (s == "")
+          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
+      }
+    }
+  }
+
+  return decoding_result;
+}
diff --git a/fluid/DeepASR/decoder/post_decode_faster.h b/fluid/DeepASR/decoder/post_decode_faster.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bade8d6988f02ef4caab8ecf6fc50209aa3642a
--- /dev/null
+++ b/fluid/DeepASR/decoder/post_decode_faster.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "base/kaldi-common.h"
+#include "base/timer.h"
+#include "decoder/decodable-matrix.h"
+#include "decoder/faster-decoder.h"
+#include "fstext/fstext-lib.h"
+#include "hmm/transition-model.h"
+#include "lat/kaldi-lattice.h"  // for {Compact}LatticeArc
+#include "tree/context-dep.h"
+#include "util/common-utils.h"
+
+
+class Decoder {
+public:
+  Decoder(std::string word_syms_filename,
+          std::string fst_in_filename,
+          std::string logprior_rxfilename,
+          kaldi::BaseFloat acoustic_scale);
+  ~Decoder();
+
+  // Interface to accept the scores read from specifier and return
+  // the batch decoding results
+  std::vector<std::string> decode(std::string posterior_rspecifier);
+
+  // Accept the scores of one utterance and return the decoding result
+  std::string decode(
+      std::string key,
+      const std::vector<std::vector<kaldi::BaseFloat>> &log_probs);
+
+private:
+  // For decoding one utterance
+  std::string decode(std::string key,
+                     kaldi::Matrix<kaldi::BaseFloat> &loglikes);
+
+  fst::SymbolTable *word_syms;
+  fst::VectorFst<fst::StdArc> *decode_fst;
+  kaldi::FasterDecoder *decoder;
+  kaldi::Vector<kaldi::BaseFloat> logprior;
+
+  bool binary;
+  kaldi::BaseFloat acoustic_scale;
+  bool allow_partial;
+};
diff --git a/fluid/DeepASR/decoder/pybind.cc b/fluid/DeepASR/decoder/pybind.cc
index 8cd65903eae63268d9f4412bd737162c639d8910..90ea38ffb535677dc66d74fc64ff3fe4a27bf824 100644
--- a/fluid/DeepASR/decoder/pybind.cc
+++ b/fluid/DeepASR/decoder/pybind.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,15 +15,25 @@
 limitations under the License. */
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include "decoder.h"
+#include "post_decode_faster.h"
 
 namespace py = pybind11;
 
-PYBIND11_MODULE(decoder, m) {
-  m.doc() = "Decode function for Deep ASR model";
-
-  m.def("decode",
-        &decode,
-        "Decode one input probability matrix "
-        "and return the transcription");
+PYBIND11_MODULE(post_decode_faster, m) {
+  m.doc() = "Decoder for Deep ASR model";
+
+  py::class_<Decoder>(m, "Decoder")
+      .def(py::init<std::string, std::string, std::string, kaldi::BaseFloat>())
+      .def("decode",
+           (std::vector<std::string> (Decoder::*)(std::string)) &
+               Decoder::decode,
+           "Decode for the probability matrices in specifier "
+           "and return the transcriptions.")
+      .def(
+          "decode",
+          (std::string (Decoder::*)(
+              std::string,
+              const std::vector<std::vector<kaldi::BaseFloat>>&)) &
+              Decoder::decode,
+          "Decode one input probability matrix "
+          "and return the transcription.");
 }
diff --git a/fluid/DeepASR/decoder/setup.py b/fluid/DeepASR/decoder/setup.py
index cedd5d644e0dc1ca8855ab7e75ee1b7a30f8fcb1..a98c0b4cc17717a6769b8322e4f5afe3de6ab2de 100644
--- a/fluid/DeepASR/decoder/setup.py
+++ b/fluid/DeepASR/decoder/setup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,27 +13,57 @@
 # limitations under the License.
 
 import os
+import glob
 from distutils.core import setup, Extension
 from distutils.sysconfig import get_config_vars
 
-args = ['-std=c++11']
+try:
+    kaldi_root = os.environ['KALDI_ROOT']
+except:
+    raise ValueError("Environment variable 'KALDI_ROOT' is not defined. Please "
+                     "install kaldi and export KALDI_ROOT=<kaldi's root dir>.")
+
+args = [
+    '-std=c++11', '-Wno-sign-compare', '-Wno-unused-variable',
+    '-Wno-unused-local-typedefs', '-Wno-unused-but-set-variable',
+    '-Wno-deprecated-declarations', '-Wno-unused-function'
+]
 
 # remove warning about -Wstrict-prototypes
 (opt, ) = get_config_vars('OPT')
 os.environ['OPT'] = " ".join(flag for flag in opt.split()
                              if flag != '-Wstrict-prototypes')
+os.environ['CC'] = 'g++'
+
+LIBS = [
+    'fst', 'kaldi-base', 'kaldi-util', 'kaldi-matrix', 'kaldi-tree',
+    'kaldi-hmm', 'kaldi-fstext', 'kaldi-decoder', 'kaldi-lat'
+]
+
+LIB_DIRS = [
+    'tools/openfst/lib', 'src/base', 'src/matrix', 'src/util', 'src/tree',
+    'src/hmm', 'src/fstext', 'src/decoder', 'src/lat'
+]
+LIB_DIRS = [os.path.join(kaldi_root, path) for path in LIB_DIRS]
+LIB_DIRS = [os.path.abspath(path) for path in LIB_DIRS]
 
 ext_modules = [
     Extension(
-        'decoder',
-        ['pybind.cc', 'decoder.cc'],
-        include_dirs=['pybind11/include', '.'],
+        'post_decode_faster',
+        ['pybind.cc', 'post_decode_faster.cc'],
+        include_dirs=[
+            'pybind11/include', '.', os.path.join(kaldi_root, 'src'),
+            os.path.join(kaldi_root, 'tools/openfst/src/include')
+        ],
         language='c++',
+        libraries=LIBS,
+        library_dirs=LIB_DIRS,
+        runtime_library_dirs=LIB_DIRS,
         extra_compile_args=args, ),
 ]
 
 setup(
-    name='decoder',
+    name='post_decode_faster',
     version='0.0.1',
     author='Paddle',
    author_email='',
diff --git a/fluid/DeepASR/decoder/setup.sh b/fluid/DeepASR/decoder/setup.sh
index 71fd6626efe1b7cf72a1e678ab7b74000ebfb8c3..1471f85f414ae8dd5230f04cf08da282adc3b0b7 100644
--- a/fluid/DeepASR/decoder/setup.sh
+++ b/fluid/DeepASR/decoder/setup.sh
@@ -1,4 +1,4 @@
-
+set -e
 if [ ! -d pybind11 ]; then
     git clone https://github.com/pybind/pybind11.git
 
diff --git a/fluid/DeepASR/infer.py b/fluid/DeepASR/infer.py
index babcb416ea884081ae249a8d1dc177f85cf1c9ba..84269261a95c381a9be21425abf43b98006f0886 100644
--- a/fluid/DeepASR/infer.py
+++ b/fluid/DeepASR/infer.py
@@ -8,7 +8,7 @@ import paddle.fluid as fluid
 import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
-import data_utils.data_reader as reader
+import data_utils.async_data_reader as reader
 from data_utils.util import lodtensor_to_ndarray
 from data_utils.util import split_infer_result
 
@@ -79,12 +79,13 @@ def infer(args):
         trans_splice.TransSplice()
     ]
 
-    infer_data_reader = reader.DataReader(args.infer_feature_lst,
-                                          args.infer_label_lst)
+    infer_data_reader = reader.AsyncDataReader(args.infer_feature_lst,
+                                               args.infer_label_lst)
     infer_data_reader.set_transformers(ltrans)
 
     feature_t = fluid.LoDTensor()
     one_batch = infer_data_reader.batch_iterator(args.batch_size, 1).next()
+
     (features, labels, lod) = one_batch
     feature_t.set(features, place)
     feature_t.set_lod([lod])
diff --git a/fluid/DeepASR/infer_by_ckpt.py b/fluid/DeepASR/infer_by_ckpt.py
index 68dd573647d498704fd22f70a7df2255e7ac66cd..bf6093acb8e14ec926d1aefb759207905e468f8d 100644
--- a/fluid/DeepASR/infer_by_ckpt.py
+++ b/fluid/DeepASR/infer_by_ckpt.py
@@ -13,7 +13,7 @@ import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
 import data_utils.async_data_reader as reader
-import decoder.decoder as decoder
+from decoder.post_decode_faster import Decoder
 from data_utils.util import lodtensor_to_ndarray
 from model_utils.model import stacked_lstmp_model
 from data_utils.util import split_infer_result
@@ -32,6 +32,11 @@ def parse_args():
         default=1,
         help='The minimum sequence number of a batch data. '
         '(default: %(default)d)')
+    parser.add_argument(
+        '--frame_dim',
+        type=int,
+        default=120 * 11,
+        help='Frame dimension of feature data. (default: %(default)d)')
     parser.add_argument(
         '--stacked_num',
         type=int,
@@ -47,6 +52,11 @@ def parse_args():
         type=int,
         default=1024,
         help='Hidden size of lstmp unit. (default: %(default)d)')
+    parser.add_argument(
+        '--class_num',
+        type=int,
+        default=1749,
+        help='Number of classes in label. (default: %(default)d)')
     parser.add_argument(
         '--learning_rate',
         type=float,
@@ -81,6 +91,26 @@ def parse_args():
         type=str,
         default='./checkpoint',
         help="The checkpoint path to init model. (default: %(default)s)")
+    parser.add_argument(
+        '--vocabulary',
+        type=str,
+        default='./decoder/graph/words.txt',
+        help="The path to vocabulary. (default: %(default)s)")
+    parser.add_argument(
+        '--graphs',
+        type=str,
+        default='./decoder/graph/TLG.fst',
+        help="The path to TLG graphs for decoding. (default: %(default)s)")
+    parser.add_argument(
+        '--log_prior',
+        type=str,
+        default="./decoder/logprior",
+        help="The log prior probs for training data. (default: %(default)s)")
+    parser.add_argument(
+        '--acoustic_scale',
+        type=float,
+        default=0.2,
(default: %(default)f)") args = parser.parse_args() return args @@ -99,10 +129,11 @@ def infer_from_ckpt(args): raise IOError("Invalid checkpoint!") prediction, avg_cost, accuracy = stacked_lstmp_model( + frame_dim=args.frame_dim, hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, stacked_num=args.stacked_num, - class_num=1749, + class_num=args.class_num, parallel=args.parallel) infer_program = fluid.default_main_program().clone() @@ -117,6 +148,10 @@ def infer_from_ckpt(args): # load checkpoint. fluid.io.load_persistables(exe, args.checkpoint) + # init decoder + decoder = Decoder(args.vocabulary, args.graphs, args.log_prior, + args.acoustic_scale) + ltrans = [ trans_add_delta.TransAddDelta(2, 2), trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var), @@ -136,12 +171,10 @@ def infer_from_ckpt(args): args.minimum_batch_size)): # load_data (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - infer_data_reader.recycle(features, labels, lod) + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) results = exe.run(infer_program, feed={"feature": feature_t, @@ -154,8 +187,8 @@ def infer_from_ckpt(args): probs, lod = lodtensor_to_ndarray(results[0]) infer_batch = split_infer_result(probs, lod) for index, sample in enumerate(infer_batch): - print("Decoding %d: " % (batch_id * args.batch_size + index), - decoder.decode(sample)) + key = "utter#%d" % (batch_id * args.batch_size + index) + print(key, ": ", decoder.decode(key, sample).encode("utf8"), "\n") print(np.mean(infer_costs), np.mean(infer_accs)) diff --git a/fluid/DeepASR/model_utils/model.py b/fluid/DeepASR/model_utils/model.py index 541f869c7224e620c519c97472dbe79ca73bd84b..8fb7596e122447979cf392d6610ad2b7281d195b 100644 --- a/fluid/DeepASR/model_utils/model.py +++ b/fluid/DeepASR/model_utils/model.py @@ -6,7 +6,8 @@ import paddle.v2 as paddle import paddle.fluid as fluid -def stacked_lstmp_model(hidden_dim, +def stacked_lstmp_model(frame_dim, + hidden_dim, proj_dim, stacked_num, class_num, @@ -20,12 +21,13 @@ def stacked_lstmp_model(hidden_dim, label data respectively. And in inference, only `feature` is needed. Args: - hidden_dim(int): The hidden state's dimension of the LSTMP layer. - proj_dim(int): The projection size of the LSTMP layer. - stacked_num(int): The number of stacked LSTMP layers. - parallel(bool): Run in parallel or not, default `False`. - is_train(bool): Run in training phase or not, default `True`. - class_dim(int): The number of output classes. + frame_dim(int): The frame dimension of feature data. + hidden_dim(int): The hidden state's dimension of the LSTMP layer. + proj_dim(int): The projection size of the LSTMP layer. + stacked_num(int): The number of stacked LSTMP layers. + parallel(bool): Run in parallel or not, default `False`. + is_train(bool): Run in training phase or not, default `True`. + class_dim(int): The number of output classes. 
""" # network configuration @@ -78,7 +80,7 @@ def stacked_lstmp_model(hidden_dim, # data feeder feature = fluid.layers.data( - name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1) + name="feature", shape=[-1, frame_dim], dtype="float32", lod_level=1) label = fluid.layers.data( name="label", shape=[-1, 1], dtype="int64", lod_level=1) @@ -92,11 +94,12 @@ def stacked_lstmp_model(hidden_dim, feat_ = pd.read_input(feature) label_ = pd.read_input(label) prediction, avg_cost, acc = _net_conf(feat_, label_) - for out in [avg_cost, acc]: + for out in [prediction, avg_cost, acc]: pd.write_output(out) # get mean loss and acc through every devices. - avg_cost, acc = pd() + prediction, avg_cost, acc = pd() + prediction.stop_gradient = True avg_cost = fluid.layers.mean(x=avg_cost) acc = fluid.layers.mean(x=acc) else: diff --git a/fluid/DeepASR/tools/profile.py b/fluid/DeepASR/tools/profile.py index 77dff3cb371659ce672e10735174b846827c9d6b..69aee88e22d33ed80212692bf61e41e1666bf5e5 100644 --- a/fluid/DeepASR/tools/profile.py +++ b/fluid/DeepASR/tools/profile.py @@ -31,6 +31,11 @@ def parse_args(): default=1, help='The minimum sequence number of a batch data. ' '(default: %(default)d)') + parser.add_argument( + '--frame_dim', + type=int, + default=120 * 11, + help='Frame dimension of feature data. (default: %(default)d)') parser.add_argument( '--stacked_num', type=int, @@ -46,6 +51,11 @@ def parse_args(): type=int, default=1024, help='Hidden size of lstmp unit. (default: %(default)d)') + parser.add_argument( + '--class_num', + type=int, + default=1749, + help='Number of classes in label. (default: %(default)d)') parser.add_argument( '--learning_rate', type=float, @@ -119,10 +129,11 @@ def profile(args): "arg 'first_batches_to_skip' must not be smaller than 0.") _, avg_cost, accuracy = stacked_lstmp_model( + frame_dim=args.frame_dim, hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, stacked_num=args.stacked_num, - class_num=1749, + class_num=args.class_num, parallel=args.parallel) optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) @@ -158,14 +169,12 @@ def profile(args): frames_seen = 0 # load_data (features, labels, lod) = batch_data - feature_t.set(features.ndarray, place) - feature_t.set_lod([lod.ndarray]) - label_t.set(labels.ndarray, place) - label_t.set_lod([lod.ndarray]) - - frames_seen += lod.ndarray[-1] + feature_t.set(features, place) + feature_t.set_lod([lod]) + label_t.set(labels, place) + label_t.set_lod([lod]) - data_reader.recycle(features, labels, lod) + frames_seen += lod[-1] outs = exe.run(fluid.default_main_program(), feed={"feature": feature_t, diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py index 8297ab7403775125b6da372c4d916e5e3111b669..3908a550cdcf095057ea6ab0b89e07dcecda51f9 100644 --- a/fluid/DeepASR/train.py +++ b/fluid/DeepASR/train.py @@ -30,6 +30,11 @@ def parse_args(): default=1, help='The minimum sequence number of a batch data. ' '(default: %(default)d)') + parser.add_argument( + '--frame_dim', + type=int, + default=120 * 11, + help='Frame dimension of feature data. (default: %(default)d)') parser.add_argument( '--stacked_num', type=int, @@ -45,6 +50,11 @@ def parse_args(): type=int, default=1024, help='Hidden size of lstmp unit. (default: %(default)d)') + parser.add_argument( + '--class_num', + type=int, + default=1749, + help='Number of classes in label. 
+        help='Number of classes in label. (default: %(default)d)')
     parser.add_argument(
         '--pass_num',
         type=int,
@@ -137,10 +147,11 @@ def train(args):
         os.mkdir(args.infer_models)
 
     prediction, avg_cost, accuracy = stacked_lstmp_model(
+        frame_dim=args.frame_dim,
         hidden_dim=args.hidden_dim,
         proj_dim=args.proj_dim,
         stacked_num=args.stacked_num,
-        class_num=1749,
+        class_num=args.class_num,
         parallel=args.parallel)
 
     # program for test
@@ -182,12 +193,10 @@ def train(args):
                                         args.minimum_batch_size)):
             # load_data
             (features, labels, lod) = batch_data
-            feature_t.set(features.ndarray, place)
-            feature_t.set_lod([lod.ndarray])
-            label_t.set(labels.ndarray, place)
-            label_t.set_lod([lod.ndarray])
-
-            test_data_reader.recycle(features, labels, lod)
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
 
             cost, acc = exe.run(test_program,
                                 feed={"feature": feature_t,
@@ -201,6 +210,7 @@ def train(args):
     # train data reader
     train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
                                                args.train_label_lst, -1)
+
     train_data_reader.set_transformers(ltrans)
     # train
     for pass_id in xrange(args.pass_num):
@@ -209,13 +219,11 @@ def train(args):
                 train_data_reader.batch_iterator(args.batch_size,
                                                  args.minimum_batch_size)):
             # load_data
-            (features, labels, lod) = batch_data
-            feature_t.set(features.ndarray, place)
-            feature_t.set_lod([lod.ndarray])
-            label_t.set(labels.ndarray, place)
-            label_t.set_lod([lod.ndarray])
-
-            train_data_reader.recycle(features, labels, lod)
+            (features, labels, lod, name_lst) = batch_data
+            feature_t.set(features, place)
+            feature_t.set_lod([lod])
+            label_t.set(labels, place)
+            label_t.set_lod([lod])
 
             to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
             outs = exe.run(fluid.default_main_program(),
diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md
index 279b4c6e57a785736a1c75928de8d45f4e4e956e..5f565afe0c33db291092faeac632da3d51f95613 100644
--- a/fluid/image_classification/caffe2fluid/README.md
+++ b/fluid/image_classification/caffe2fluid/README.md
@@ -2,7 +2,8 @@ This tool is used to convert a Caffe model to Fluid model
 
 ### Howto
-1, Prepare caffepb.py in ./proto, two options provided
+1, Prepare caffepb.py in ./proto if your Python has no 'pycaffe' module; two options are provided here:
+
     1) generate it from caffe.proto using protoc
         bash ./proto/compile.sh
 
@@ -12,14 +13,24 @@ This tool is used to convert a Caffe model to Fluid model
 2, Convert the caffe model using 'convert.py' which will generate a python script and a weight (in .npy) file
 
 3, Use the converted model to predict
-    see more detail info in 'tests/lenet/README.md'
+
+    see more detailed info in 'examples/xxx'
+
 
-### Supported models
+### Tested models
 - Lenet on mnist dataset
 
 - ResNets: (ResNet-50, ResNet-101, ResNet-152)
-  model addrs: (https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)
+  model addr: https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777
+
+- GoogleNet:
+  model addr: https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034
+
+- VGG:
+  model addr: https://gist.github.com/ksimonyan/211839e770f7b538e2d8
+
+- AlexNet:
+  model addr: https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet
 
 ### Notes
 Some of this code comes from here: https://github.com/ethereon/caffe-tensorflow
diff --git a/fluid/image_classification/caffe2fluid/convert.py b/fluid/image_classification/caffe2fluid/convert.py
index 68a9e4f7e490a69c1b582d6fc14b2015bfdf9536..379f1a26368c9ffa4a9f82dad499ad7114f942fc 100755
--- a/fluid/image_classification/caffe2fluid/convert.py
+++ b/fluid/image_classification/caffe2fluid/convert.py
@@ -4,8 +4,8 @@ import os
 import sys
 import numpy as np
 import argparse
-from kaffe import KaffeError, print_stderr
 
+from kaffe import KaffeError, print_stderr
 from kaffe.paddle import Transformer
 
 
@@ -47,6 +47,8 @@ def convert(def_path, caffemodel_path, data_output_path, code_output_path,
     except KaffeError as err:
         fatal_error('Error encountered: {}'.format(err))
 
+    return 0
+
 
 def main():
     """ main
@@ -64,9 +66,10 @@ def main():
         help='The phase to convert: test (default) or train')
     args = parser.parse_args()
     validate_arguments(args)
-    convert(args.def_path, args.caffemodel, args.data_output_path,
-            args.code_output_path, args.phase)
+    return convert(args.def_path, args.caffemodel, args.data_output_path,
+                   args.code_output_path, args.phase)
 
 
 if __name__ == '__main__':
-    main()
+    ret = main()
+    sys.exit(ret)
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/README.md b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b82050859239be8804ddec8e2054edc38c4ac052
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/README.md
@@ -0,0 +1,10 @@
+a demo to show converting caffe models on 'imagenet' using caffe2fluid
+
+---
+
+# How to use
+
+1. prepare python environment
+2. download caffe model to "models.caffe/xxx" which contains "xxx.caffemodel" and "xxx.prototxt"
+3. run the tool
+    eg: bash ./run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg b/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..fd3a93f59385d6ff632483646e6caee300b56d09
Binary files /dev/null and b/fluid/image_classification/caffe2fluid/examples/imagenet/data/65.jpeg differ
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec594199be5a3e7a33c9673b1d5497c95f20d946
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
@@ -0,0 +1,142 @@
+#!/bin/env python
+
+#function:
+#   a demo to show how to use the converted model generated by caffe2fluid
+#
+#notes:
+#   only support imagenet data
+
+import os
+import sys
+import inspect
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def load_data(imgfile, shape):
+    h, w = shape[1:]
+    from PIL import Image
+    im = Image.open(imgfile)
+
+    # The storage order of the loaded image is W(width),
+    # H(height), C(channel). PaddlePaddle requires
+    # the CHW order, so transpose them.
+    im = im.resize((w, h), Image.ANTIALIAS)
+    im = np.array(im).astype(np.float32)
+    im = im.transpose((2, 0, 1))  # CHW
+    im = im[(2, 1, 0), :, :]  # BGR
+
+    # The mean to be subtracted from each image.
+    # By default, the per-channel ImageNet mean.
+    mean = np.array([104., 117., 124.], dtype=np.float32)
+    mean = mean.reshape([3, 1, 1])
+    im = im - mean
+    return im.reshape([1] + shape)
+
+
+def build_model(net_file, net_name):
+    print('build model with net_file[%s] and net_name[%s]' %
+          (net_file, net_name))
+
+    net_path = os.path.dirname(net_file)
+    module_name = os.path.splitext(os.path.basename(net_file))[0]
+    if net_path not in sys.path:
+        sys.path.insert(0, net_path)
+
+    try:
+        m = __import__(module_name, fromlist=[net_name])
+        MyNet = getattr(m, net_name)
+    except Exception as e:
+        print('failed to load module[%s]' % (module_name))
+        print(e)
+        return None
+
+    input_name = 'data'
+    input_shape = MyNet.input_shapes()[input_name]
+    images = fluid.layers.data(name='image', shape=input_shape, dtype='float32')
+    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    net = MyNet({input_name: images})
+    input_shape = MyNet.input_shapes()[input_name]
+    return net, input_shape
+
+
+def dump_results(results, names, root):
+    if os.path.exists(root) is False:
+        os.mkdir(root)
+
+    for i in range(len(names)):
+        n = names[i]
+        res = results[i]
+        filename = os.path.join(root, n)
+        np.save(filename + '.npy', res)
+
+
+def infer(net_file, net_name, model_file, imgfile, debug=False):
+    """ do inference using a model which consist 'xxx.py' and 'xxx.npy'
+    """
+    #1, build model
+    net, input_shape = build_model(net_file, net_name)
+    prediction = net.get_output()
+
+    #2, load weights for this model
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    startup_program = fluid.default_startup_program()
+    exe.run(startup_program)
+
+    if model_file.find('.npy') > 0:
+        net.load(data_path=model_file, exe=exe, place=place)
+    else:
+        net.load(data_path=model_file, exe=exe)
+
+    #3, test this model
+    test_program = fluid.default_main_program().clone()
+
+    fetch_list_var = []
+    fetch_list_name = []
+    if debug is False:
+        fetch_list_var.append(prediction)
+    else:
+        for k, v in net.layers.items():
+            fetch_list_var.append(v)
+            fetch_list_name.append(k)
+
+    np_images = load_data(imgfile, input_shape)
+    results = exe.run(program=test_program,
+                      feed={'image': np_images},
+                      fetch_list=fetch_list_var)
+
+    if debug is True:
+        dump_path = 'results.layers'
+        dump_results(results, fetch_list_name, dump_path)
+        print('all results dumped to [%s]' % (dump_path))
+    else:
+        result = results[0]
+        print('predicted class:', np.argmax(result))
+
+
+if __name__ == "__main__":
+    """ maybe more convenient to use 'run.sh' to call this tool
+    """
+    net_file = 'models/resnet50/resnet50.py'
+    weight_file = 'models/resnet50/resnet50.npy'
+    imgfile = 'data/65.jpeg'
+    net_name = 'ResNet50'
+
+    argc = len(sys.argv)
+    if argc == 5:
+        net_file = sys.argv[1]
+        weight_file = sys.argv[2]
+        imgfile = sys.argv[3]
+        net_name = sys.argv[4]
+    elif argc > 1:
+        print('usage:')
+        print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' %
+              (sys.argv[0]))
+        print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
+                                              weight_file, imgfile, net_name))
+        sys.exit(1)
+
+    infer(net_file, net_name, weight_file, imgfile)
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7a1a5ebd7c0a5090c00a0c8ca6b0e11b110967dc
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+#function:
+#   a tool used to:
+#   1, convert a caffe model
+#   2, do inference using this model
+#
+#usage:
+#   bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
+#
+
+#set -x
+if [[ $# -lt 3 ]];then
+    echo "usage:"
+    echo "  bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
+    echo "  eg: bash $0 resnet50 ./models.caffe/resnet50 ./models/resnet50"
+    exit 1
+else
+    model_name=$1
+    cf_model_path=$2
+    pd_model_path=$3
+    only_convert=$4
+fi
+
+proto_file=$cf_model_path/${model_name}.prototxt
+caffemodel_file=$cf_model_path/${model_name}.caffemodel
+weight_file=$pd_model_path/${model_name}.npy
+net_file=$pd_model_path/${model_name}.py
+
+if [[ ! -e $proto_file ]];then
+    echo "not found prototxt[$proto_file]"
+    exit 1
+fi
+
+if [[ ! -e $caffemodel_file ]];then
+    echo "not found caffemodel[$caffemodel_file]"
+    exit 1
+fi
+
+if [[ ! -e $pd_model_path ]];then
+    mkdir $pd_model_path
+fi
+
+PYTHON=`which cfpython`
+if [[ -z $PYTHON ]];then
+    PYTHON=`which python`
+fi
+$PYTHON ../../convert.py \
+        $proto_file \
+        --caffemodel $caffemodel_file \
+        --data-output-path $weight_file\
+        --code-output-path $net_file
+
+ret=$?
+if [[ $ret -ne 0 ]];then
+    echo "failed to convert caffe model[$cf_model_path]"
+    exit $ret
+else
+    echo "succeed to convert caffe model[$cf_model_path] to fluid model[$pd_model_path]"
+fi
+
+if [[ -z $only_convert ]];then
+    PYTHON=`which pdpython`
+    if [[ -z $PYTHON ]];then
+        PYTHON=`which python`
+    fi
+    imgfile="data/65.jpeg"
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
+    ret=$?
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/README.md b/fluid/image_classification/caffe2fluid/examples/mnist/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd427d632737c8988403f987d86c159500022198
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/README.md
@@ -0,0 +1,10 @@
+a demo to show converting caffe model on 'mnist' using caffe2fluid
+
+---
+
+# How to use
+
+1. prepare python environment
+2. download caffe model to "models.caffe/lenet" which contains "lenet.caffemodel" and "lenet.prototxt"
Run the tool, e.g.:
+   bash ./run.sh lenet ./models.caffe/lenet ./models/lenet
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/predict.py b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
similarity index 66%
rename from fluid/image_classification/caffe2fluid/tests/lenet/predict.py
rename to fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
index 7405cc6f848ea139bc4edd4c3ec0e0af773ea25a..5c86635d5a014262bdec40fe063915350c5fadb3 100644
--- a/fluid/image_classification/caffe2fluid/tests/lenet/predict.py
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/evaluate.py
@@ -4,12 +4,12 @@
 #   demo to show how to use converted model using caffe2fluid
 #
+import sys
+import os
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-from lenet import LeNet as MyNet
-

 def test_model(exe, test_program, fetch_list, test_reader, feeder):
     acc_set = []
@@ -24,10 +24,15 @@ def test_model(exe, test_program, fetch_list, test_reader, feeder):
     return float(acc_val)

-def main(model_path):
+def evaluate(net_file, model_file):
     """ main
     """
-    print('load fluid model in %s' % (model_path))
+    #1, build model
+    net_path = os.path.dirname(net_file)
+    if net_path not in sys.path:
+        sys.path.insert(0, net_path)
+
+    from lenet import LeNet as MyNet

     with_gpu = False
     paddle.init(use_gpu=with_gpu)
@@ -45,10 +50,10 @@
     exe.run(fluid.default_startup_program())

     #2, load weights
-    if model_path.find('.npy') > 0:
-        net.load(data_path=model_path, exe=exe, place=place)
+    if model_file.find('.npy') > 0:
+        net.load(data_path=model_file, exe=exe, place=place)
     else:
-        net.load(data_path=model_path, exe=exe)
+        net.load(data_path=model_file, exe=exe)

     #3, test this model
     test_program = fluid.default_main_program().clone()
@@ -65,10 +70,17 @@


 if __name__ == "__main__":
-    import sys
-    if len(sys.argv) == 2:
-        fluid_model_path = sys.argv[1]
-    else:
-        fluid_model_path = './model.fluid'
-
-    main(fluid_model_path)
+    net_file = 'models/lenet/lenet.py'
+    weight_file = 'models/lenet/lenet.npy'
+
+    argc = len(sys.argv)
+    if argc == 3:
+        net_file = sys.argv[1]
+        weight_file = sys.argv[2]
+    elif argc > 1:
+        print('usage:')
+        print('\tpython %s [net_file] [weight_file]' % (sys.argv[0]))
+        print('\teg:python %s %s %s' % (sys.argv[0], net_file, weight_file))
+        sys.exit(1)
+
+    evaluate(net_file, weight_file)
diff --git a/fluid/image_classification/caffe2fluid/examples/mnist/run.sh b/fluid/image_classification/caffe2fluid/examples/mnist/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eee83ef7cefd594c62fd95db525f081a27c6ea38
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/examples/mnist/run.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+#function:
+#   a tool used to:
+#      1, convert a caffe model
+#      2, evaluate this model
+#
+#usage:
+#   bash run.sh lenet ./models.caffe/lenet ./models/lenet
+#
+
+#set -x
+if [[ $# -lt 3 ]];then
+    echo "usage:"
+    echo "  bash $0 [model_name] [cf_model_path] [pd_model_path] [only_convert]"
+    echo "  eg: bash $0 lenet ./models.caffe/lenet ./models/lenet"
+    exit 1
+else
+    model_name=$1
+    cf_model_path=$2
+    pd_model_path=$3
+    only_convert=$4
+fi
+
+proto_file=$cf_model_path/${model_name}.prototxt
+caffemodel_file=$cf_model_path/${model_name}.caffemodel
+weight_file=$pd_model_path/${model_name}.npy
+net_file=$pd_model_path/${model_name}.py
+
+if [[ ! -e $proto_file ]];then
+    echo "prototxt[$proto_file] not found"
+    exit 1
+fi
+
+if [[ ! 
-e $caffemodel_file ]];then
+    echo "caffemodel[$caffemodel_file] not found"
+    exit 1
+fi
+
+if [[ ! -e $pd_model_path ]];then
+    mkdir -p $pd_model_path
+fi
+
+PYTHON=`which cfpython`
+if [[ -z $PYTHON ]];then
+    PYTHON=`which python`
+fi
+$PYTHON ../../convert.py \
+        $proto_file \
+        --caffemodel $caffemodel_file \
+        --data-output-path $weight_file \
+        --code-output-path $net_file
+
+ret=$?
+if [[ $ret -ne 0 ]];then
+    echo "failed to convert caffe model[$cf_model_path]"
+    exit $ret
+else
+    echo "succeeded in converting caffe model[$cf_model_path] to fluid model[$pd_model_path]"
+fi
+
+if [[ -z $only_convert ]];then
+    PYTHON=`which pdpython`
+    if [[ -z $PYTHON ]];then
+        PYTHON=`which python`
+    fi
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    if [[ $net_name != "LeNet" ]];then
+        echo "only LeNet is supported"
+        exit 1
+    fi
+    $PYTHON ./evaluate.py $net_file $weight_file
+    ret=$?
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
index 5fbd48d3ade5ab4b812210acf82be625871740cb..6ad7767ed8a88f1c0258ad36cc35221c33b641e5 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
@@ -54,7 +54,6 @@ def show_fallback_warning():
 WARNING: PyCaffe not found! Falling back to a pure protocol buffer implementation.
 * Conversions will be drastically slower.
-* This backend is UNTESTED!
 ------------------------------------------------------------
 '''
diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py
index cb751dffa1ca9cc19214bed12681312942046df6..5387f441852b8a318a41898ee0b62b4903ccdabb 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/graph.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -175,6 +175,7 @@ class GraphBuilder(object):
         kind = NodeKind.map_raw_kind(layer.type)
         if kind is None:
             raise KaffeError('Unknown layer type encountered: %s' % layer.type)
+
         # We want to use the layer's top names (the "output" names), rather than the
         # name attribute, which is more of readability thing than a functional one.
         # Other layers will refer to a node by its "top name".
@@ -235,6 +236,7 @@ class GraphBuilder(object):
                 node.add_parent(parent_node)
             if len(layer.top) > 1:
                 raise KaffeError('Multiple top nodes are not supported.')
+
             for output_name in layer.top:
                 if output_name == layer.name:
                     # Output is named the same as the node. No further action required. 
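The layers.py change below lets the converter handle prototxts written in Caffe's legacy V1LayerParameter format, where layer types are identified by enum value rather than by name. A minimal sketch of the enum-to-name lookup it builds, using an assumed small subset of the (name, enum) pairs from the patch:

    # Sketch of the V1 enum -> layer-name lookup (subset of the full tables).
    V1_PAIRS = [('CONVOLUTION', 4), ('POOLING', 17), ('RELU', 18)]
    LAYER_TYPES = ['Convolution', 'Pooling', 'ReLU']  # keys of LAYER_DESCRIPTORS

    name2layer = {t.upper(): t for t in LAYER_TYPES}
    v1_map = {enum: name2layer[name] for name, enum in V1_PAIRS}

    # map_raw_kind can then resolve a raw kind given as a string or an enum value.
    assert v1_map[4] == 'Convolution'
    assert v1_map[17] == 'Pooling'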
diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py
index 6be35ed727fed76a1c96017455bdaa354ace9f97..f263407ab41458573f2df775f99202bed0e9d894 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/layers.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py
@@ -51,17 +51,79 @@ LAYER_DESCRIPTORS = {
     'Threshold': shape_identity,
 }

-LAYER_TYPES = LAYER_DESCRIPTORS.keys()
+# layer types in 'V1LayerParameter'
+# (V1 layer type name, enum value)
+v1_layertypes = [
+    ('ABSVAL', 35),
+    ('ACCURACY', 1),
+    ('ARGMAX', 30),
+    ('BNLL', 2),
+    ('CONCAT', 3),
+    ('CONVOLUTION', 4),
+    ('DATA', 5),
+    ('DECONVOLUTION', 39),
+    ('DROPOUT', 6),
+    ('ELTWISE', 25),
+    ('EXP', 38),
+    ('FLATTEN', 8),
+    ('IM2COL', 11),
+    ('INNERPRODUCT', 14),
+    ('LRN', 15),
+    ('MEMORYDATA', 29),
+    ('MULTINOMIALLOGISTICLOSS', 16),
+    ('MVN', 34),
+    ('POOLING', 17),
+    ('POWER', 26),
+    ('RELU', 18),
+    ('SIGMOID', 19),
+    ('SIGMOIDCROSSENTROPYLOSS', 27),
+    ('SILENCE', 36),
+    ('SOFTMAX', 20),
+    ('SPLIT', 22),
+    ('SLICE', 33),
+    ('TANH', 23),
+    ('WINDOWDATA', 24),
+    ('THRESHOLD', 31),
+]

+LAYER_TYPES = LAYER_DESCRIPTORS.keys()
 LayerType = type('LayerType', (), {t: t for t in LAYER_TYPES})

+# map V1 layer type enum values to the standard layer type names
+V1_LAYER_MAP = {'_not_init_': True}
+
+
+def get_v1_layer_map():
+    global V1_LAYER_MAP
+    if '_not_init_' not in V1_LAYER_MAP:
+        return V1_LAYER_MAP
+    else:
+        del V1_LAYER_MAP['_not_init_']
+
+    name2layer = {}
+    for n in LAYER_TYPES:
+        name2layer[n.upper()] = n
+
+    for l in v1_layertypes:
+        n, v = l
+        if n in name2layer and v not in V1_LAYER_MAP:
+            V1_LAYER_MAP[v] = name2layer[n]
+        else:
+            raise KaffeError('unknown v1 layer type: %s' % n)
+    return V1_LAYER_MAP
+

 class NodeKind(LayerType):
     @staticmethod
     def map_raw_kind(kind):
         if kind in LAYER_TYPES:
             return kind
-        return None
+
+        v1_layers = get_v1_layer_map()
+        if kind in v1_layers:
+            return v1_layers[kind]
+        else:
+            return None

     @staticmethod
     def compute_output_shape(node):
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
index 620a84e8f1289672151f1f280559a56b37995ce0..fd6a71cb6acbfffe2aed1d3680fb91c8c85dc3d3 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
@@ -27,6 +27,9 @@ def layer(op):
         self.layers[name] = layer_output
         # This output is now the input for the next layer.
         self.feed(layer_output)
+        #print('output shape of %s:' % (name))
+        #print layer_output.shape
+
         # Return self for chained calls. 
        return self
@@ -158,41 +161,64 @@ class Network(object):
         output = fluid.layers.relu(x=input)
         return output

-    @layer
-    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
-        if padding is None:
-            padding = [0, 0]
+    def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
+        #adjust the padding if needed
+        i_h, i_w = i_hw
+        k_h, k_w = k_hw
+        s_h, s_w = s_hw
+        p_h, p_w = p_hw
+
+        def is_consistent(i, k, s, p):
+            o = i + 2 * p - k
+            if o % s == 0:
+                return True
+            else:
+                return False
+
+        real_p_h = 0
+        real_p_w = 0
+        if not is_consistent(i_h, k_h, s_h, p_h):
+            real_p_h = int(k_h / 2)
+        if not is_consistent(i_w, k_w, s_w, p_w):
+            real_p_w = int(k_w / 2)
+
+        return [real_p_h, real_p_w]
+
+    def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
-        # Get the number of channels in the input
+        # Get the spatial (height, width) size of the input
-        h_i, w_i = input.shape[2:]
-        fluid = import_fluid()
-        output = fluid.layers.pool2d(
-            input=input,
-            pool_size=[k_h, k_w],
-            pool_stride=[s_h, s_w],
-            pool_padding=padding,
-            pool_type='max')
-        return output
+        in_hw = input.shape[2:]
+        k_hw = [k_h, k_w]
+        s_hw = [s_h, s_w]

-    @layer
-    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
         if padding is None:
-            padding = [0, 0]
+            #fix the padding inconsistency between conv and pool in caffe
+            #more info: https://github.com/BVLC/caffe/issues/1318
+            padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])

-        # Get the number of channels in the input
-        h_i, w_i = input.shape[2:]
         fluid = import_fluid()
         output = fluid.layers.pool2d(
             input=input,
-            pool_size=[k_h, k_w],
-            pool_stride=[s_h, s_w],
+            pool_size=k_hw,
+            pool_stride=s_hw,
             pool_padding=padding,
-            pool_type='avg')
+            pool_type=pool_type)
         return output

+    @layer
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)
+
+    @layer
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)
+
     @layer
     def lrn(self, input, radius, alpha, beta, name, bias=1.0):
-        raise Exception('lrn() not implemented yet')
+        fluid = import_fluid()
+        output = fluid.layers.lrn(input=input, \
+                n=radius, k=bias, alpha=alpha, beta=beta, name=name)
+        return output

     @layer
     def concat(self, inputs, axis, name):
@@ -228,7 +254,7 @@ class Network(object):
     @layer
     def softmax(self, input, name):
         fluid = import_fluid()
-        output = fluid.layers.softmax(x=input, name=name)
+        output = fluid.layers.softmax(input)
         return output

     @layer
@@ -256,5 +282,8 @@ class Network(object):
         return output

     @layer
-    def dropout(self, input, keep_prob, name):
-        raise Exception('dropout() not implemented yet')
+    def dropout(self, input, drop_prob, name, is_test=True):
+        fluid = import_fluid()
+        output = fluid.layers.dropout(
+            input, dropout_prob=drop_prob, is_test=is_test, name=name)
+        return output
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
index 92b9d32a3a755d8e6a2a8739cc3f42f9c8564b40..4d7ec49a39199bb1415f830d88f89e93a4b95266 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
@@ -132,8 +132,7 @@ class TensorFlowMapper(NodeMapper):
         # just scales by alpha (as does Krizhevsky's paper).
         # We'll account for that here. 
alpha = params.alpha / float(params.local_size) - return TensorFlowNode('lrn', - int(params.local_size / 2), alpha, params.beta) + return TensorFlowNode('lrn', params.local_size, alpha, params.beta) def map_concat(self, node): return TensorFlowNode('concat', node.parameters.axis) @@ -191,22 +190,33 @@ class TensorFlowEmitter(object): def emit_setup_def(self): return self.statement('def setup(self):') - def emit_convert_def(self, input_nodes): - def data_layer_def(name, shape, dtype=None): - if dtype is None: - dtype = 'float32' + def emit_shape_def(self, input_nodes): + self.outdent() + func_def = self.statement('@classmethod') + func_def += self.statement('def input_shapes(cls):') + self.indent() - layer_var = name + '_layer' - shape = [str(s) for s in shape[1:]] - layer_def = '%s = fluid.layers.data(name="%s", shape=[%s], dtype="%s")'\ - % (layer_var, name, ','.join(shape), dtype) - return layer_var, layer_def + input_shapes = {} + for n in input_nodes: + name = n.name + output_shape = n.output_shape + shape = [str(s) for s in output_shape[1:]] + input_shapes[name] = ', '.join(shape) + input_shapes = ['"%s": [%s]' % (n, l) for n, l in input_shapes.items()] + shape_str = ','.join(input_shapes) + func_def += self.statement('return {%s}' % (shape_str)) + return '\n\n' + func_def + def emit_convert_def(self, input_nodes): codes = [] inputs = {} + codes.append('shapes = cls.input_shapes()') for n in input_nodes: name = n.name - layer_var, layer_def = data_layer_def(n.name, n.output_shape) + layer_var = name + '_layer' + layer_def = '%s = fluid.layers.data(name="%s", shape=shapes["%s"],'\ + ' dtype="float32")' % (layer_var, name, name) + #layer_var, layer_def = data_layer_def(n.name, n.output_shape) codes.append(layer_def) inputs[name] = layer_var @@ -229,7 +239,7 @@ class TensorFlowEmitter(object): func_def += self.statement('import paddle.v2.fluid as fluid') for l in codes: func_def += self.statement(l) - return '\n\n' + func_def + return '\n' + func_def def emit_main_def(self, name): if name is None: @@ -273,6 +283,7 @@ class TensorFlowEmitter(object): b += self.emit_node(node) blocks.append(b[:-1]) s = s + '\n\n'.join(blocks) + s += self.emit_shape_def(input_nodes) s += self.emit_convert_def(input_nodes) s += self.emit_main_def(name) return s diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/README.md b/fluid/image_classification/caffe2fluid/tests/lenet/README.md deleted file mode 100644 index 982edc2aa67f43f849bb2523b1a15edaa02f5d28..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/README.md +++ /dev/null @@ -1,28 +0,0 @@ -### Convert lenet model from caffe format into paddle format(fluid api) - -### Howto -1, Prepare your caffepb.py - -2, Download a lenet caffe-model - lenet_iter_10000.caffemodel - download address: https://github.com/ethereon/caffe-tensorflow/raw/master/examples/mnist/lenet_iter_10000.caffemodel - md5: cbec75c1c374b6c1981c4a1eb024ae01 - - lenet.prototxt - download address: https://raw.githubusercontent.com/BVLC/caffe/master/examples/mnist/lenet.prototxt - md5: 27384af843338ab90b00c8d1c81de7d5 - - -2, Convert this model(make sure caffepb.py is ready in ../../proto) - convert to npy format - bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy - - save to fluid format(optional) - bash ./convert.sh lenet.prototxt lenet.caffemodel lenet.py lenet.npy && python ./lenet.py ./lenet.npy ./fluid.model - -4, Use this new model(paddle installed in this python) - use fluid format - python ./predict.py 
./fluid.model - - use npy format - python ./predict.py ./lenet.npy diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh b/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh deleted file mode 100755 index b3ec1a1dce2434a4466cf5d4609de1b4aec9d346..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -#function: -# convert a caffe model -# eg: -# bash ./convert.sh ./model.caffe/lenet.prototxt ./model.caffe/lenet.caffemodel lenet.py lenet.npy - -if [[ $# -ne 4 ]];then - echo "usage:" - echo " bash $0 [PROTOTXT] [CAFFEMODEL] [PY_NAME] [WEIGHT_NAME]" - echo " eg: bash $0 lenet.prototxt lenet.caffemodel lenet.py lenet.npy" - exit 1 -fi - -WORK_ROOT=$(dirname `readlink -f ${BASH_SOURCE[0]}`) -if [[ -z $PYTHON ]];then - PYTHON=`which python` -fi - -PROTOTXT=$1 -CAFFEMODEL=$2 -PY_NAME=$3 -WEIGHT_NAME=$4 -CONVERTER_PY="$WORK_ROOT/../../convert.py" - -$PYTHON $CONVERTER_PY $PROTOTXT --caffemodel $CAFFEMODEL --code-output-path=$PY_NAME --data-output-path=$WEIGHT_NAME -ret=$? -if [[ $ret -eq 0 ]];then - echo "succeed to convert caffe model[$CAFFEMODEL, $PROTOTXT] to paddle model[$PY_NAME, $WEIGHT_NAME]" -else - echo "failed to convert caffe model[$CAFFEMODEL, $PROTOTXT]" -fi -exit $ret diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy deleted file mode 100644 index 66f773e5ffd54c8f5151b920aecdf3dd4f8c91d2..0000000000000000000000000000000000000000 Binary files a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy and /dev/null differ diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py deleted file mode 100644 index 50e6927483a61c574f1152c6dc438a6b2c8a4d90..0000000000000000000000000000000000000000 --- a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py +++ /dev/null @@ -1,297 +0,0 @@ -### generated by caffe2fluid, your net is in class "LeNet" ### - -import math -import os -import numpy as np - - -def import_fluid(): - import paddle.v2.fluid as fluid - return fluid - - -def layer(op): - '''Decorator for composable network layers.''' - - def layer_decorated(self, *args, **kwargs): - # Automatically set a name if not provided. - name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) - # Figure out the layer inputs. - if len(self.terminals) == 0: - raise RuntimeError('No input variables found for layer %s.' % name) - elif len(self.terminals) == 1: - layer_input = self.terminals[0] - else: - layer_input = list(self.terminals) - # Perform the operation and get the output. - layer_output = op(self, layer_input, *args, **kwargs) - # Add to layer LUT. - self.layers[name] = layer_output - # This output is now the input for the next layer. - self.feed(layer_output) - # Return self for chained calls. - return self - - return layer_decorated - - -class Network(object): - def __init__(self, inputs, trainable=True): - # The input nodes for this network - self.inputs = inputs - # The current list of terminal nodes - self.terminals = [] - # Mapping from layer names to layers - self.layers = dict(inputs) - # If true, the resulting variables are set as trainable - self.trainable = trainable - # Switch variable for dropout - self.paddle_env = None - self.setup() - - def setup(self): - '''Construct the network. 
''' - raise NotImplementedError('Must be implemented by the subclass.') - - def load(self, data_path, exe=None, place=None, ignore_missing=False): - '''Load network weights. - data_path: The path to the numpy-serialized network weights - ignore_missing: If true, serialized weights for missing layers are ignored. - ''' - fluid = import_fluid() - #load fluid mode directly - if os.path.isdir(data_path): - assert (exe is not None), \ - 'must provide a executor to load fluid model' - fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path) - return True - - #load model from a npy file - if exe is None or place is None: - if self.paddle_env is None: - place = fluid.CPUPlace() - exe = fluid.Executor(place) - self.paddle_env = {'place': place, 'exe': exe} - exe = exe.run(fluid.default_startup_program()) - else: - place = self.paddle_env['place'] - exe = self.paddle_env['exe'] - - data_dict = np.load(data_path).item() - for op_name in data_dict: - layer = self.layers[op_name] - for param_name, data in data_dict[op_name].iteritems(): - try: - name = '%s_%s' % (op_name, param_name) - v = fluid.global_scope().find_var(name) - w = v.get_tensor() - w.set(data, place) - except ValueError: - if not ignore_missing: - raise - return True - - def feed(self, *args): - '''Set the input(s) for the next operation by replacing the terminal nodes. - The arguments can be either layer names or the actual layers. - ''' - assert len(args) != 0 - self.terminals = [] - for fed_layer in args: - if isinstance(fed_layer, basestring): - try: - fed_layer = self.layers[fed_layer] - except KeyError: - raise KeyError('Unknown layer name fed: %s' % fed_layer) - self.terminals.append(fed_layer) - return self - - def get_output(self): - '''Returns the current network output.''' - return self.terminals[-1] - - def get_unique_name(self, prefix): - '''Returns an index-suffixed unique name for the given prefix. - This is used for auto-generating layer names based on the type-prefix. 
- ''' - ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 - return '%s_%d' % (prefix, ident) - - @layer - def conv(self, - input, - k_h, - k_w, - c_o, - s_h, - s_w, - name, - relu=True, - padding=None, - group=1, - biased=True): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - c_i, h_i, w_i = input.shape[1:] - - # Verify that the grouping parameter is valid - assert c_i % group == 0 - assert c_o % group == 0 - - fluid = import_fluid() - prefix = name + '_' - output = fluid.layers.conv2d( - input=input, - filter_size=[k_h, k_w], - num_filters=c_o, - stride=[s_h, s_w], - padding=padding, - groups=group, - param_attr=fluid.ParamAttr(name=prefix + "weights"), - bias_attr=fluid.ParamAttr(name=prefix + "biases"), - act="relu" if relu is True else None) - return output - - @layer - def relu(self, input, name): - fluid = import_fluid() - output = fluid.layers.relu(x=input) - return output - - @layer - def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - h_i, w_i = input.shape[2:] - fluid = import_fluid() - output = fluid.layers.pool2d( - input=input, - pool_size=[k_h, k_w], - pool_stride=[s_h, s_w], - pool_padding=padding, - pool_type='max') - return output - - @layer - def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): - if padding is None: - padding = [0, 0] - - # Get the number of channels in the input - h_i, w_i = input.shape[2:] - fluid = import_fluid() - output = fluid.layers.pool2d( - input=input, - pool_size=[k_h, k_w], - pool_stride=[s_h, s_w], - pool_padding=padding, - pool_type='avg') - return output - - @layer - def lrn(self, input, radius, alpha, beta, name, bias=1.0): - raise Exception('lrn() not implemented yet') - - @layer - def concat(self, inputs, axis, name): - fluid = import_fluid() - output = fluid.layers.concat(input=inputs, axis=axis) - return output - - @layer - def add(self, inputs, name): - fluid = import_fluid() - output = inputs[0] - for i in inputs[1:]: - output = fluid.layers.elementwise_add(x=output, y=i) - return output - - @layer - def fc(self, input, num_out, name, relu=True, act=None): - fluid = import_fluid() - - if act is None: - act = 'relu' if relu is True else None - - prefix = name + '_' - output = fluid.layers.fc( - name=name, - input=input, - size=num_out, - act=act, - param_attr=fluid.ParamAttr(name=prefix + 'weights'), - bias_attr=fluid.ParamAttr(name=prefix + 'biases')) - return output - - @layer - def softmax(self, input, name): - fluid = import_fluid() - output = fluid.layers.softmax(x=input, name=name) - return output - - @layer - def batch_normalization(self, input, name, scale_offset=True, relu=False): - # NOTE: Currently, only inference is supported - fluid = import_fluid() - prefix = name + '_' - param_attr = None if scale_offset is False else fluid.ParamAttr( - name=prefix + 'scale') - bias_attr = None if scale_offset is False else fluid.ParamAttr( - name=prefix + 'offset') - mean_name = prefix + 'mean' - variance_name = prefix + 'variance' - output = fluid.layers.batch_norm( - name=name, - input=input, - is_test=True, - param_attr=param_attr, - bias_attr=bias_attr, - moving_mean_name=mean_name, - moving_variance_name=variance_name, - epsilon=1e-5, - act='relu' if relu is True else None) - - return output - - @layer - def dropout(self, input, keep_prob, name): - raise Exception('dropout() not implemented yet') - - -class LeNet(Network): - def setup(self): - 
self.feed('data') - self.conv(5, 5, 20, 1, 1, relu=False, name='conv1') - self.max_pool(2, 2, 2, 2, name='pool1') - self.conv(5, 5, 50, 1, 1, relu=False, name='conv2') - self.max_pool(2, 2, 2, 2, name='pool2') - self.fc(500, name='ip1') - self.fc(10, relu=False, name='ip2') - self.softmax(name='prob') - - @classmethod - def convert(cls, npy_model, fluid_path): - import paddle.v2.fluid as fluid - data_layer = fluid.layers.data( - name="data", shape=[1, 28, 28], dtype="float32") - feed_data = {"data": data_layer} - net = cls(feed_data) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - net.load(data_path=npy_model, exe=exe, place=place) - fluid.io.save_persistables(executor=exe, dirname=fluid_path) - - -if __name__ == "__main__": - #usage: python xxxnet.py xxx.npy ./model - - import sys - npy_weight = sys.argv[1] - fluid_model = sys.argv[2] - LeNet.convert(npy_weight, fluid_model) - exit(0) diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py index c2b2d680fc995b1ea6cc5a2f640746a8a79ac029..b1adf0baba8a987ae1a971e148375c6a0730d860 100644 --- a/fluid/image_classification/se_resnext.py +++ b/fluid/image_classification/se_resnext.py @@ -1,4 +1,7 @@ import os +import numpy as np +import time +import sys import paddle.v2 as paddle import paddle.fluid as fluid import reader @@ -65,20 +68,44 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt(input, class_dim, infer=False): - cardinality = 64 - reduction_ratio = 16 - depth = [3, 8, 36, 3] - num_filters = [128, 256, 512, 1024] +def SE_ResNeXt(input, class_dim, infer=False, layers=50): + supported_layers = [50, 152] + if layers not in supported_layers: + print("supported layers are", supported_layers, "but input layer is", + layers) + exit() + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=128, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = conv_bn_layer( + input=input, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') for block in range(len(depth)): for i in range(depth[block]): @@ -104,7 +131,10 @@ def train(learning_rate, num_passes, init_model=None, model_save_dir='model', - parallel=True): + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): class_dim = 1000 image_shape = [3, 224, 224] @@ -113,36 +143,52 @@ def train(learning_rate, if parallel: places = fluid.layers.get_places() - pd = 
fluid.layers.ParallelDo(places)
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
         with pd.do():
             image_ = pd.read_input(image)
             label_ = pd.read_input(label)
-            out = SE_ResNeXt(input=image_, class_dim=class_dim)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
             cost = fluid.layers.cross_entropy(input=out, label=label_)
             avg_cost = fluid.layers.mean(x=cost)
-            accuracy = fluid.layers.accuracy(input=out, label=label_)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
             pd.write_output(avg_cost)
-            pd.write_output(accuracy)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)

-        avg_cost, accuracy = pd()
+        avg_cost, acc_top1, acc_top5 = pd()
         avg_cost = fluid.layers.mean(x=avg_cost)
-        accuracy = fluid.layers.mean(x=accuracy)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
     else:
-        out = SE_ResNeXt(input=image, class_dim=class_dim)
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
         cost = fluid.layers.cross_entropy(input=out, label=label)
         avg_cost = fluid.layers.mean(x=cost)
-        accuracy = fluid.layers.accuracy(input=out, label=label)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))

-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=learning_rate,
-        momentum=0.9,
-        regularization=fluid.regularizer.L2Decay(1e-4))
     opts = optimizer.minimize(avg_cost)
+    fluid.memory_optimize(fluid.default_main_program())

     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program([avg_cost, accuracy])
+        inference_program = fluid.io.get_inference_program(
+            [avg_cost, acc_top1, acc_top5])

     place = fluid.CUDAPlace(0)
     exe = fluid.Executor(place)
@@ -156,34 +202,86 @@
     feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

     for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
         for batch_id, data in enumerate(train_reader()):
-            loss = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost])
-            print("Pass {0}, batch {1}, loss {2}".format(pass_id, batch_id,
-                                                         float(loss[0])))
-
-        total_loss = 0.0
-        total_acc = 0.0
-        total_batch = 0
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, "
+                      "acc1 {3}, acc5 {4}, time {5}"
+                      .format(pass_id,
+                              batch_id, loss[0], acc1[0], acc5[0],
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
-        for data in test_reader():
+        for batch_id, data in enumerate(test_reader()):
-            loss, acc = exe.run(inference_program,
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost, accuracy])
-            total_loss += 
float(loss)
-            total_acc += float(acc)
-            total_batch += 1
-        print("End pass {0}, test_loss {1}, test_acc {2}".format(
-            pass_id, total_loss / total_batch, total_acc / total_batch))
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, testbatch {1}, loss {2}, "
+                      "acc1 {3}, acc5 {4}, time {5}"
+                      .format(pass_id,
+                              batch_id, loss[0], acc1[0], acc5[0],
+                              "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
+              "test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+              .format(pass_id,
+                      train_loss, train_acc1, train_acc5, test_loss, test_acc1,
+                      test_acc5))
+        sys.stdout.flush()

         model_path = os.path.join(model_save_dir, str(pass_id))
-        fluid.io.save_inference_model(model_path, ['image'], [out], exe)
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)


 if __name__ == '__main__':
+    epoch_points = [30, 60, 90]
+    total_images = 1281167
+    batch_size = 256
+    step = int(total_images / batch_size + 1)
+    bd = [e * step for e in epoch_points]
+    lr = [0.1, 0.01, 0.001, 0.0001]
+
+    lr_strategy = {"bd": bd, "lr": lr}
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = 50
+
     train(
         learning_rate=0.1,
-        batch_size=8,
-        num_passes=100,
+        batch_size=batch_size,
+        num_passes=120,
         init_model=None,
-        parallel=False)
+        parallel=True,
+        use_nccl=use_nccl,
+        lr_strategy=lr_strategy,
+        layers=layers)
diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py
index b91a8672629e76557a005897f5fe044074633028..29d7c4868a61e489659c2ed04f0ff09ff48b52c8 100644
--- a/fluid/neural_machine_translation/transformer/config.py
+++ b/fluid/neural_machine_translation/transformer/config.py
@@ -22,7 +22,7 @@ class TrainTaskConfig(object):
 class InferTaskConfig(object):
     use_gpu = False
     # the number of examples in one run for sequence generation.
-    batch_size = 10
+    batch_size = 1

     # the parameters for beam search.
     beam_size = 5
@@ -92,7 +92,9 @@
 encoder_input_data_names = (
     "src_word",
     "src_pos",
     "src_slf_attn_bias",
-    "src_data_shape", )
+    "src_data_shape",
+    "src_slf_attn_pre_softmax_shape",
+    "src_slf_attn_post_softmax_shape", )

 # Names of all data layers in decoder listed in order.
 decoder_input_data_names = (
@@ -101,6 +103,10 @@
     "trg_slf_attn_bias",
     "trg_src_attn_bias",
     "trg_data_shape",
+    "trg_slf_attn_pre_softmax_shape",
+    "trg_slf_attn_post_softmax_shape",
+    "trg_src_attn_pre_softmax_shape",
+    "trg_src_attn_post_softmax_shape",
     "enc_output", )

 # Names of label related data layers listed in order. 
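The *_pre_softmax_shape and *_post_softmax_shape inputs added to config.py above exist because fluid's softmax op only supports 2-D tensors at this point (see the FIXME removed from model.py below): the 4-D attention scores are flattened to 2-D before softmax and restored afterwards, with the actual shapes fed in at run time. A minimal numpy sketch of that round trip, with assumed example dimensions:

    import numpy as np

    def softmax_2d(x):
        # row-wise softmax over a 2-D array
        e = np.exp(x - x.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    # attention scores: [batch, n_head, trg_len, src_len] (example sizes)
    scores = np.random.rand(2, 8, 4, 6).astype("float32")
    pre_shape = [-1, scores.shape[-1]]   # the pre-softmax shape input
    post_shape = scores.shape            # the post-softmax shape input

    weights = softmax_2d(scores.reshape(pre_shape)).reshape(post_shape)
    assert np.allclose(weights.sum(axis=-1), 1.0)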
diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 5d572c566cb67e0035ad3d2271706dea8d3f1d83..ee7d5d41c0b407e4e9b60fc028e42f256dfbdfd7 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -1,6 +1,6 @@ import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import model @@ -27,11 +27,19 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, is_target=False, return_pos=True, return_attn_bias=True, - return_max_len=True) - enc_in_data = enc_in_data[:-1] + [ + return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ np.array( - [batch_size, enc_in_data[-1], d_model], dtype="int32") - ] # Append the data shape input. + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] + # Append the shape inputs to reshape before and after softmax in encoder + # self attention. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1]], dtype="int32"), np.array( + enc_in_data[2].shape, dtype="int32") + ] enc_output = exe.run(encoder, feed=dict(zip(enc_in_names, enc_in_data)), fetch_list=enc_out_names)[0] @@ -73,8 +81,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_words = np.array( [[bos_idx]] * batch_size * beam_size, dtype="int64") trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64") - src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[-1][ - 1], enc_in_data[-2], 1 + src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[2].shape[ + -1], enc_in_data[2], 1 # This is used to remove attention on subsequent words. trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len, trg_max_len)) @@ -89,19 +97,38 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, -1, src_slf_attn_bias.shape[1], trg_max_len, src_slf_attn_bias.shape[-1] ]) + # Append the shape input to reshape the output of embedding layer. trg_data_shape = np.array( [batch_size * beam_size, trg_max_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") enc_output = np.tile( enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( [-1, enc_output.shape[-2], enc_output.shape[-1]]) - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): """ Update the input data of decoder mainly by slicing from the previous input data and dropping the finished instance beams. 
""" - trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = dec_in_data + trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 trg_words = np.array( [ @@ -129,11 +156,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_src_attn_bias = np.tile(trg_src_attn_bias[ active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. trg_data_shape = np.array( [len(active_beams) * beam_size, trg_cur_len, d_model], dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # decoder self attention. + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + # Append the shape inputs to reshape before and after softmax in + # encoder-decoder attention. + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") enc_output = enc_output[active_beams_indice, :, :] - return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output + return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output) diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index ff339c31e5b5ef3fa1ece8c9195ea7e866f14f98..deba195c34f6c71fb4a78f46b4e2678f67feaa75 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -29,7 +29,9 @@ def multi_head_attention(queries, d_value, d_model, n_head=1, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activiation to mask certain selected positions so that @@ -109,21 +111,16 @@ def multi_head_attention(queries, """ Scaled Dot-Product Attention """ - - # FIXME(guosheng): Remove __softmax when softmax_op supporting high - # rank tensors. softmax_op only supports 2D tensor currently. - # Otherwise, add extra input data to reshape. 
- def __softmax(x, eps=1e-9): - exp_out = layers.exp(x=x) - sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False) - return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) - scaled_q = layers.scale(x=q, scale=d_model**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - weights = __softmax( - layers.elementwise_add( - x=product, y=attn_bias) if attn_bias else product) - # weights = __softmax(product) + weights = layers.reshape( + x=layers.elementwise_add( + x=product, y=attn_bias) if attn_bias else product, + shape=[-1, product.shape[-1]], + actual_shape=pre_softmax_shape, + act="softmax") + weights = layers.reshape( + x=weights, shape=product.shape, actual_shape=post_softmax_shape) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False) @@ -248,7 +245,9 @@ def encoder_layer(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """The encoder layers that can be stacked to form a deep encoder. This module consits of a multi-head (self) attention followed by @@ -256,9 +255,9 @@ def encoder_layer(enc_input, with the post_process_layer to add residual connection, layer normalization and droput. """ - attn_output = multi_head_attention(enc_input, enc_input, enc_input, - attn_bias, d_key, d_value, d_model, - n_head, dropout_rate) + attn_output = multi_head_attention( + enc_input, enc_input, enc_input, attn_bias, d_key, d_value, d_model, + n_head, dropout_rate, pre_softmax_shape, post_softmax_shape) attn_output = post_process_layer(enc_input, attn_output, "dan", dropout_rate) ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) @@ -273,7 +272,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + pre_softmax_shape=None, + post_softmax_shape=None): """ The encoder is composed of a stack of identical layers returned by calling encoder_layer. @@ -287,7 +288,9 @@ def encoder(enc_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + pre_softmax_shape, + post_softmax_shape, ) enc_input = enc_output return enc_output @@ -301,7 +304,11 @@ def decoder_layer(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The layer to be stacked in decoder part. The structure of this module is similar to that in the encoder part except @@ -316,7 +323,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) slf_attn_output = post_process_layer( dec_input, slf_attn_output, @@ -331,7 +340,9 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - dropout_rate, ) + dropout_rate, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) enc_attn_output = post_process_layer( slf_attn_output, enc_attn_output, @@ -359,7 +370,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate=0.): + dropout_rate=0., + slf_attn_pre_softmax_shape=None, + slf_attn_post_softmax_shape=None, + src_attn_pre_softmax_shape=None, + src_attn_post_softmax_shape=None): """ The decoder is composed of a stack of identical decoder_layer layers. 
""" @@ -374,7 +389,11 @@ def decoder(dec_input, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) dec_input = dec_output return dec_output @@ -383,11 +402,13 @@ def make_inputs(input_data_names, n_head, d_model, max_length, - is_pos=True, - slf_attn_bias_flag=True, - src_attn_bias_flag=True, + is_pos, + slf_attn_bias_flag, + src_attn_bias_flag, enc_output_flag=False, - data_shape_flag=True): + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True): """ Define the input data layers for the transformer model. """ @@ -425,7 +446,8 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [slf_attn_bias] if src_attn_bias_flag: - # This input is used to remove attention weights on paddings. + # This input is used to remove attention weights on paddings. It's used + # in encoder-decoder attention. # The actual data shape of slf_attn_bias_flag is: # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch] src_attn_bias = layers.data( @@ -435,13 +457,41 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [src_attn_bias] if data_shape_flag: - # This input is used to reshape. + # This input is used to reshape the output of embedding layer. data_shape = layers.data( name=input_data_names[len(input_layers)], shape=[3], dtype="int32", append_batch_size=False) input_layers += [data_shape] + if slf_attn_shape_flag: + # This shape input is used to reshape before softmax in self attention. + slf_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in self attention. + slf_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [slf_attn_post_softmax_shape] + if src_attn_shape_flag: + src_attn_pre_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[2], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_pre_softmax_shape] + src_attn_post_softmax_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[4], + dtype="int32", + append_batch_size=False) + input_layers += [src_attn_post_softmax_shape] if enc_output_flag: # This input is used in independent decoder program for inference. 
# The actual data shape of slf_attn_bias_flag is: @@ -452,6 +502,7 @@ def make_inputs(input_data_names, dtype="float32", append_batch_size=False) input_layers += [enc_output] + return input_layers @@ -469,8 +520,18 @@ def transformer( src_pad_idx, trg_pad_idx, pos_pad_idx, ): - enc_inputs = make_inputs(encoder_input_data_names, n_head, d_model, - max_length, True, True, False) + enc_inputs = make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) enc_output = wrap_encoder( src_vocab_size, @@ -486,8 +547,18 @@ def transformer( pos_pad_idx, enc_inputs, ) - dec_inputs = make_inputs(decoder_input_data_names, n_head, d_model, - max_length, True, True, True) + dec_inputs = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) predict = wrap_decoder( trg_vocab_size, @@ -506,9 +577,19 @@ def transformer( # Padding index do not contribute to the total loss. The weights is used to # cancel padding index in calculating the loss. - gold, weights = make_inputs(label_data_names, n_head, d_model, max_length, - False, False, False, False, False) - cost = layers.cross_entropy(input=predict, label=gold) + gold, weights = make_inputs( + label_data_names, + n_head, + d_model, + max_length, + is_pos=False, + slf_attn_bias_flag=False, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=False, + slf_attn_shape_flag=False, + src_attn_shape_flag=False) + cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) weighted_cost = cost * weights return layers.reduce_sum(weighted_cost), predict @@ -530,12 +611,24 @@ def wrap_encoder(src_vocab_size, """ if enc_inputs is None: # This is used to implement independent encoder program in inference. - src_word, src_pos, src_slf_attn_bias, src_data_shape = make_inputs( - encoder_input_data_names, n_head, d_model, max_length, True, True, - False) + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_inputs( + encoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=False, + enc_output_flag=False, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=False) else: - src_word, src_pos, src_slf_attn_bias, src_data_shape = enc_inputs - + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs enc_input = prepare_encoder( src_word, src_pos, @@ -555,7 +648,9 @@ def wrap_encoder(src_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, ) return enc_output @@ -577,11 +672,26 @@ def wrap_decoder(trg_vocab_size, """ if dec_inputs is None: # This is used to implement independent decoder program in inference. 
- trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, enc_output = make_inputs( - decoder_input_data_names, n_head, d_model, max_length, True, True, - True, True) + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape, enc_output = make_inputs( + decoder_input_data_names, + n_head, + d_model, + max_length, + is_pos=True, + slf_attn_bias_flag=True, + src_attn_bias_flag=True, + enc_output_flag=True, + data_shape_flag=True, + slf_attn_shape_flag=True, + src_attn_shape_flag=True) else: - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape = dec_inputs + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs dec_input = prepare_decoder( trg_word, @@ -604,13 +714,17 @@ def wrap_decoder(trg_vocab_size, d_value, d_model, d_inner_hid, - dropout_rate, ) - + dropout_rate, + slf_attn_pre_softmax_shape, + slf_attn_post_softmax_shape, + src_attn_pre_softmax_shape, + src_attn_post_softmax_shape, ) + # Return logits for training and probs for inference. predict = layers.reshape( x=layers.fc(input=dec_output, size=trg_vocab_size, bias_attr=False, num_flatten_dims=2), shape=[-1, trg_vocab_size], - act="softmax") + act="softmax" if dec_inputs is None else None) return predict diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index b00552333aed71f6be09c238f4d42456bd9d20ee..83098e9b8720ef9652f4071f37c1fb7af83109c0 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -1,7 +1,7 @@ import os import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid from model import transformer, position_encoding_init @@ -66,16 +66,35 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") + + # These shape tensors are used in reshape_op. 
+ src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") + src_slf_attn_pre_softmax_shape = np.array( + [-1, src_slf_attn_bias.shape[-1]], dtype="int32") + src_slf_attn_post_softmax_shape = np.array( + src_slf_attn_bias.shape, dtype="int32") + trg_slf_attn_pre_softmax_shape = np.array( + [-1, trg_slf_attn_bias.shape[-1]], dtype="int32") + trg_slf_attn_post_softmax_shape = np.array( + trg_slf_attn_bias.shape, dtype="int32") + trg_src_attn_pre_softmax_shape = np.array( + [-1, trg_src_attn_bias.shape[-1]], dtype="int32") + trg_src_attn_post_softmax_shape = np.array( + trg_src_attn_bias.shape, dtype="int32") + lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head, False, False, False, False) lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) - src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") - trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") + input_dict = dict( zip(input_data_names, [ - src_word, src_pos, src_slf_attn_bias, src_data_shape, trg_word, - trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_data_shape, - lbl_word, lbl_weight + src_word, src_pos, src_slf_attn_bias, src_data_shape, + src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, + trg_data_shape, trg_slf_attn_pre_softmax_shape, + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, + trg_src_attn_post_softmax_shape, lbl_word, lbl_weight ])) return input_dict @@ -122,6 +141,10 @@ def main(): def test(exe): test_costs = [] for batch_id, data in enumerate(val_data()): + if len(data) != TrainTaskConfig.batch_size: + # Since we use the sum cost, keep comparable cost by fixing the + # batch size. Remove this if the cost is mean. 
+ continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + label_data_names, ModelHyperParams.src_pad_idx, diff --git a/fluid/object_detection/image_util.py b/fluid/object_detection/image_util.py index ba8744eda0a078acd38cad9b10ca7511185efc43..e538449aa9f9ce15e7730de293a98fe753403a87 100644 --- a/fluid/object_detection/image_util.py +++ b/fluid/object_detection/image_util.py @@ -1,4 +1,4 @@ -from PIL import Image +from PIL import Image, ImageEnhance import numpy as np import random import math @@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): sample_img = img[ymin:ymax, xmin:xmax] sample_labels = transform_labels(bbox_labels, sample_bbox) return sample_img, sample_labels + + +def random_brightness(img, settings): + prob = random.uniform(0, 1) + if prob < settings._brightness_prob: + delta = random.uniform(-settings._brightness_delta, + settings._brightness_delta) + 1 + img = ImageEnhance.Brightness(img).enhance(delta) + return img + + +def random_contrast(img, settings): + prob = random.uniform(0, 1) + if prob < settings._contrast_prob: + delta = random.uniform(-settings._contrast_delta, + settings._contrast_delta) + 1 + img = ImageEnhance.Contrast(img).enhance(delta) + return img + + +def random_saturation(img, settings): + prob = random.uniform(0, 1) + if prob < settings._saturation_prob: + delta = random.uniform(-settings._saturation_delta, + settings._saturation_delta) + 1 + img = ImageEnhance.Color(img).enhance(delta) + return img + + +def random_hue(img, settings): + prob = random.uniform(0, 1) + if prob < settings._hue_prob: + delta = random.uniform(-settings._hue_delta, settings._hue_delta) + img_hsv = np.array(img.convert('HSV')) + img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta + img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') + return img + + +def distort_image(img, settings): + prob = random.uniform(0, 1) + # Apply different distort order + if prob > 0.5: + img = random_brightness(img, settings) + img = random_contrast(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + else: + img = random_brightness(img, settings) + img = random_saturation(img, settings) + img = random_hue(img, settings) + img = random_contrast(img, settings) + return img + + +def expand_image(img, bbox_labels, img_width, img_height, settings): + prob = random.uniform(0, 1) + if prob < settings._expand_prob: + expand_ratio = random.uniform(1, settings._expand_max_ratio) + if expand_ratio - 1 >= 0.01: + height = int(img_height * expand_ratio) + width = int(img_width * expand_ratio) + h_off = math.floor(random.uniform(0, height - img_height)) + w_off = math.floor(random.uniform(0, width - img_width)) + expand_bbox = bbox(-w_off / img_width, -h_off / img_height, + (width - w_off) / img_width, + (height - h_off) / img_height) + expand_img = np.ones((height, width, 3)) + expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean)) + expand_img = Image.fromarray(expand_img) + expand_img.paste(img, (int(w_off), int(h_off))) + bbox_labels = transform_labels(bbox_labels, expand_bbox) + return expand_img, bbox_labels + return img, bbox_labels diff --git a/fluid/object_detection/reader.py b/fluid/object_detection/reader.py index 4e680c29997b432c14b92ea641aa9f956de41292..6a6beb6e50f5b0a7f6b969ca53868178db2527a6 100644 --- a/fluid/object_detection/reader.py +++ b/fluid/object_detection/reader.py @@ -22,17 +22,38 @@ import os class Settings(object): - def __init__(self, 
+    def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value,
+                 apply_distort, apply_expand):
         self._data_dir = data_dir
         self._label_list = []
         label_fpath = os.path.join(data_dir, label_file)
         for line in open(label_fpath):
             self._label_list.append(line.strip())
+        self._apply_distort = apply_distort
+        self._apply_expand = apply_expand
         self._resize_height = resize_h
         self._resize_width = resize_w
         self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
             'float32')
+        self._expand_prob = 0.5
+        self._expand_max_ratio = 4
+        self._hue_prob = 0.5
+        self._hue_delta = 18
+        self._contrast_prob = 0.5
+        self._contrast_delta = 0.5
+        self._saturation_prob = 0.5
+        self._saturation_delta = 0.5
+        self._brightness_prob = 0.5
+        self._brightness_delta = 0.125
+
+    @property
+    def apply_expand(self):
+        return self._apply_expand
+
+    @property
+    def apply_distort(self):
+        return self._apply_distort

     @property
     def data_dir(self):
@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle):

             img = Image.open(img_path)
             img_width, img_height = img.size
-            img = np.array(img)

             # layout: label | xmin | ymin | xmax | ymax | difficult
             if mode == 'train' or mode == 'test':
@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle):
                 sample_labels = bbox_labels
                 if mode == 'train':
+                    if settings._apply_distort:
+                        img = image_util.distort_image(img, settings)
+                    if settings._apply_expand:
+                        img, bbox_labels = image_util.expand_image(
+                            img, bbox_labels, img_width, img_height,
+                            settings)
                     batch_sampler = []
                     # hard-code here
                     batch_sampler.append(
@@ -126,13 +152,14 @@ def _reader_creator(settings, file_list, mode, shuffle):
                     sampled_bbox = image_util.generate_batch_samples(
                         batch_sampler, bbox_labels, img_width, img_height)

+                    img = np.array(img)
                     if len(sampled_bbox) > 0:
                         idx = int(random.uniform(0, len(sampled_bbox)))
                         img, sample_labels = image_util.crop_image(
                             img, bbox_labels, sampled_bbox[idx], img_width,
                             img_height)
-                        img = Image.fromarray(img)

+                img = Image.fromarray(img)
                 img = img.resize((settings.resize_w, settings.resize_h),
                                  Image.ANTIALIAS)
                 img = np.array(img)
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index ed826a8eebaf9487c4b7e7e28427beaf7760e101..a6c8e9e273cb2e08fc789acfdf9f92cb4e70f341 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -12,8 +12,9 @@ import functools
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('parallel', bool, True, "Whether use parallel training.")
-add_arg('use_gpu', bool, True, "Whether use GPU.")
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('parallel', bool, True, "Whether to use parallel training.")
+add_arg('use_gpu', bool, True, "Whether to use GPU.")
 # yapf: disable
@@ -47,26 +48,24 @@ def train(args,
             locs, confs, box, box_var = mobile_net(image_, image_shape)
             loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
                                          box, box_var)
+            nmsed_out = fluid.layers.detection_output(
+                locs, confs, box, box_var, nms_threshold=0.45)
+            loss = fluid.layers.reduce_sum(loss)
             pd.write_output(loss)
-            pd.write_output(locs)
-            pd.write_output(confs)
-            pd.write_output(box)
-            pd.write_output(box_var)
+            pd.write_output(nmsed_out)

-        loss, locs, confs, box, box_var = pd()
-        loss = fluid.layers.reduce_sum(loss)
+        loss, nmsed_out = pd()
+        loss = fluid.layers.mean(loss)
     else:
         locs, confs, box, box_var = mobile_net(image, image_shape)
        nmsed_out = fluid.layers.detection_output(
-            locs, mbox_confs, box, box_var, nms_threshold=0.45)
-        loss = fluid.layers.ssd_loss(locs, mbox_confs, gt_box, gt_label,
+            locs, confs, box, box_var, nms_threshold=0.45)
+        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label,
                                      box, box_var)
         loss = fluid.layers.reduce_sum(loss)

     test_program = fluid.default_main_program().clone(for_test=True)
     with fluid.program_guard(test_program):
-        nmsed_out = fluid.layers.detection_output(
-            locs, confs, box, box_var, nms_threshold=0.45)
         map_eval = fluid.evaluator.DetectionMAP(
             nmsed_out,
             gt_label,
@@ -77,13 +76,10 @@ def train(args,
             evaluate_difficult=False,
             ap_version='11point')

-    optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=learning_rate,
-            decay_steps=40000,
-            decay_rate=0.1,
-            staircase=True),
-        momentum=0.9,
+    boundaries = [40000, 60000]
+    values = [0.001, 0.0005, 0.00025]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
         regularization=fluid.regularizer.L2Decay(0.00005), )
     optimizer.minimize(loss)

@@ -92,7 +88,8 @@ def train(args,
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    load_model.load_paddlev1_vars(place)
+    load_model.load_and_set_vars(place)
+    #load_model.load_paddlev1_vars(place)
     train_reader = paddle.batch(
         reader.train(data_args, train_file_list), batch_size=batch_size)
     test_reader = paddle.batch(
@@ -100,7 +97,6 @@ def train(args,
     feeder = fluid.DataFeeder(
         place=place, feed_list=[image, gt_box, gt_label, difficult])

-    #print 'test_program ', test_program
     def test(pass_id):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
@@ -111,14 +107,14 @@ def train(args,
                 fetch_list=[accum_map])
         print("Test {0}, map {1}".format(pass_id, test_map[0]))

-    #print 'main_program ', fluid.default_main_program()
     for pass_id in range(num_passes):
         for batch_id, data in enumerate(train_reader()):
             loss_v = exe.run(fluid.default_main_program(),
                              feed=feeder.feed(data),
                              fetch_list=[loss])
-            print("Pass {0}, batch {1}, loss {2}"
-                  .format(pass_id, batch_id, loss_v[0]))
+            if batch_id % 20 == 0:
+                print("Pass {0}, batch {1}, loss {2}"
+                      .format(pass_id, batch_id, loss_v[0]))
         test(pass_id)

         if pass_id % 10 == 0:
@@ -134,6 +130,8 @@ if __name__ == '__main__':
     data_args = reader.Settings(
         data_dir='./data',
         label_file='label_list',
+        apply_distort=True,
+        apply_expand=True,
         resize_h=300,
         resize_w=300,
         mean_value=[127.5, 127.5, 127.5])
@@ -142,5 +140,5 @@ if __name__ == '__main__':
         val_file_list='./data/test.txt',
         data_args=data_args,
         learning_rate=0.001,
-        batch_size=32,
+        batch_size=args.batch_size,
         num_passes=300)
diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py
index 73616ecb36ca2661eb8e4898caf34fc2d91b9bdc..dd5aaa3f94c1e2668ec75d30735640d14ee8ef0e 100644
--- a/fluid/ocr_recognition/crnn_ctc_model.py
+++ b/fluid/ocr_recognition/crnn_ctc_model.py
@@ -187,15 +187,18 @@ def ctc_train_net(images, label, args, num_classes):
     error_evaluator = fluid.evaluator.EditDistance(
         input=decoded_out, label=casted_label)

-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(error_evaluator)
+    inference_program = fluid.default_main_program().clone(for_test=True)

     optimizer = fluid.optimizer.Momentum(
         learning_rate=args.learning_rate, momentum=args.momentum)
     _, params_grads = optimizer.minimize(sum_cost)
+    model_average = fluid.optimizer.ModelAverage(
+        params_grads,
+        args.average_window,
+        min_average_window=args.min_average_window,
+        max_average_window=args.max_average_window)

-    return sum_cost, error_evaluator, inference_program
+    return sum_cost, error_evaluator, inference_program, model_average


 def ctc_infer(images, num_classes):
diff --git a/fluid/ocr_recognition/ctc_reader.py b/fluid/ocr_recognition/ctc_reader.py
index f095c9b3bb7cdf36c247cca1c93ea7d636b91d24..c9f75a0d523a7390b3814706cdad831d5900dbdb 100644
--- a/fluid/ocr_recognition/ctc_reader.py
+++ b/fluid/ocr_recognition/ctc_reader.py
@@ -1,14 +1,25 @@
 import os
 import cv2
+import tarfile
 import numpy as np
 from PIL import Image
-
+from os import path
 from paddle.v2.image import load_image
 import paddle.v2 as paddle

 NUM_CLASSES = 10784
 DATA_SHAPE = [1, 48, 512]
+DATA_MD5 = "1de60d54d19632022144e4e58c2637b5"
+DATA_URL = "http://cloud.dlnel.org/filepub/?uuid=df937251-3c0b-480d-9a7b-0080dfeee65c"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+

 class DataGenerator(object):
     def __init__(self):
@@ -102,25 +113,42 @@ class DataGenerator(object):


 def num_classes():
+    '''Get the number of classes in this dataset.
+    '''
     return NUM_CLASSES


 def data_shape():
+    '''Get the image shape of this dataset (a fixed dummy shape).
+    '''
     return DATA_SHAPE


 def train(batch_size):
     generator = DataGenerator()
+    data_dir = download_data()
     return generator.train_reader(
-        "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train_images/",
-        "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train.list",
-        batch_size)
+        path.join(data_dir, TRAIN_DATA_DIR_NAME),
+        path.join(data_dir, TRAIN_LIST_FILE_NAME), batch_size)


 def test(batch_size=1):
     generator = DataGenerator()
+    data_dir = download_data()
     return paddle.batch(
         generator.test_reader(
-            "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test_images/",
-            "/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test.list"
-        ), batch_size)
+            path.join(data_dir, TEST_DATA_DIR_NAME),
+            path.join(data_dir, TEST_LIST_FILE_NAME)), batch_size)
+
+
+def download_data():
+    '''Download train and test data.
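+
+    The tarball is fetched with paddle.dataset.common.download (verified
+    against DATA_MD5) and extracted next to the cached file on first use;
+    later calls reuse the extracted directory.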
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py
index c2d8fd26bbdeb3ad5c9fb2c1ade3b2b22a0dfd44..2ac23f609779c5e653919fece6dbf661c79e859f 100644
--- a/fluid/ocr_recognition/ctc_train.py
+++ b/fluid/ocr_recognition/ctc_train.py
@@ -8,6 +8,7 @@ import functools
 import sys
 from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
 from crnn_ctc_model import ctc_train_net
+import time

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
@@ -23,7 +24,10 @@ add_arg('momentum', float, 0.9, "Momentum.")
 add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
 add_arg('device', int, 0, "Device id.'-1' means running on CPU"
         "while '0' means GPU-0.")
-add_arg('parallel', bool, True, "Whether use parallel training.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 15625, "Max average window.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
 # yapf: disable

 def load_parameter(place):
@@ -40,7 +44,7 @@ def train(args, data_reader=dummy_reader):
     # define network
     images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
-    sum_cost, error_evaluator, inference_program = ctc_train_net(images, label, args, num_classes)
+    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(images, label, args, num_classes)

     # data reader
     train_reader = data_reader.train(args.batch_size)
@@ -66,21 +70,24 @@ def train(args, data_reader=dummy_reader):
                 fetch_list=[sum_cost] + error_evaluator.metrics)
             total_loss += batch_loss[0]
             total_seq_error += batch_seq_error[0]
-            if batch_id % 10 == 1:
+            if batch_id % 100 == 1:
                 print '.',
                 sys.stdout.flush()
             if batch_id % args.log_period == 1:
-                print "\nPass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
+                print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
+                    time.time(),
                     pass_id, batch_id, total_loss / (batch_id * args.batch_size),
                     total_seq_error / (batch_id * args.batch_size))
                 sys.stdout.flush()
             batch_id += 1

-        error_evaluator.reset(exe)
-        for data in test_reader():
-            exe.run(inference_program, feed=get_feeder_data(data, place))
-        _, test_seq_error = error_evaluator.eval(exe)
-        print "\nEnd pass[%d]; Test seq error: %s.\n" % (
-            pass_id, str(test_seq_error[0]))
+        with model_average.apply(exe):
+            error_evaluator.reset(exe)
+            for data in test_reader():
+                exe.run(inference_program, feed=get_feeder_data(data, place))
+            _, test_seq_error = error_evaluator.eval(exe)
+
+            print "\nEnd pass[%d]; Test seq error: %s.\n" % (
+                pass_id, str(test_seq_error[0]))


 def main():
     args = parser.parse_args()
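For reference, the step schedule that fluid/object_detection/train.py configures above via fluid.layers.piecewise_decay(boundaries=[40000, 60000], values=[0.001, 0.0005, 0.00025]) behaves like the following minimal pure-Python sketch. The helper name piecewise_lr is illustrative only (not part of the diff), and the exact value chosen at a boundary step may differ by one step from the fluid implementation:

def piecewise_lr(step, boundaries=(40000, 60000),
                 values=(0.001, 0.0005, 0.00025)):
    # Return the learning rate in effect at a global training step:
    # values[i] applies until boundaries[i] is reached; the last value
    # applies from the final boundary onward.
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

assert piecewise_lr(0) == 0.001        # steps [0, 40000)
assert piecewise_lr(50000) == 0.0005   # steps [40000, 60000)
assert piecewise_lr(90000) == 0.00025  # steps >= 60000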